from termcolor import colored
from sklearn.tree import DecisionTreeClassifier
import missingno as msno
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import pickle
import pprint
from sklearn.ensemble import RandomForestRegressor
from pandas_profiling import ProfileReport
from dateutil import relativedelta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from statsmodels.regression.linear_model import OLS
# Master switch for the plotting helpers: each of them returns early, drawing
# nothing, while this flag is falsy. Set to False to run the report text-only.
plot_______ = True
def new_line():
    """Print a dashed separator line with an empty line before and after it."""
    print("", "-------------------------", "", sep="\n")
def RMSE(predictions, actual=None):
    """Return the root-mean-squared error rounded to the nearest integer.

    Parameters
    ----------
    predictions : array-like
        Predicted target values.
    actual : array-like, optional
        True target values. When omitted, the module-level ``test_y`` is
        used, so existing ``RMSE(predictions)`` calls keep working.

    Returns
    -------
    int
        ``round(sqrt(mean((actual - predictions) ** 2)))``.
    """
    if actual is None:
        # Backward-compatible fallback on the global hold-out target.
        actual = test_y
    squared_errors = (actual - predictions) ** 2
    return round(np.sqrt(squared_errors.mean()))
def cluping_rare_cases_in_one_catagory(x):
    """Lump the infrequent categories of column ``x`` of the global ``df``.

    Every value that occurs fewer than 10 times is relabelled "Rare cases".
    If that lumped group still holds fewer than 8 rows, those rows receive
    the column's most common value instead.

    Parameters
    ----------
    x : str
        Name of the column in ``df`` to process.

    Returns
    -------
    pandas.Series or None
        The transformed column. ``None`` when only one distinct value is
        left; in that case the column is dropped from ``df`` in place and a
        message is printed.
    """
    global df
    x = df[x]
    orignal = x.copy(deep=True)

    counts = x.value_counts()
    rare_labels = counts[counts < 10].index.to_list()
    x = x.replace(rare_labels, "Rare cases")

    # Count the lumped group explicitly instead of reading the last entry of
    # value_counts() by position: positional ``[]`` on a label index is
    # deprecated in pandas and fails on an empty column.
    is_rare = x == "Rare cases"
    rare_count = is_rare.sum()
    if 0 < rare_count < 8:
        # Fewer than 8 rows ended up in "Rare cases": replace them with the
        # most common value.
        x[is_rare] = x.mode()[0]

    if x.nunique() == 1:
        new_line()
        to_print = f"The column <{x.name}> have imbalanced, so we droped it, it has {orignal.nunique()} unique values, and most commont value frequency ratio is {(orignal == orignal.mode()[0]).mean()}"
        print(colored(to_print, 'red'))
        df.drop(columns=x.name, inplace=True)
        return None
    return x
def plot_numerical_columns(col_name):
    """Draw four diagnostic plots for numeric column ``col_name`` of ``df``.

    The plots are a histogram, the values against the row index, the sorted
    values against their rank, and a box plot. The first three carry a red
    mean marker and a green median marker.

    Does nothing and returns ``None`` when the global ``plot_______`` switch
    is falsy.
    """
    if not plot_______:
        return None

    values = df[col_name]
    mean_value = values.mean()
    median_value = values.median()

    def _mark_mean_and_median(draw_line):
        # ``draw_line`` is plt.axvline or plt.axhline, whichever runs across
        # the axis that carries the column's values.
        draw_line(mean_value, color='red')
        draw_line(median_value, color='green')
        plt.legend(['Actual', 'Mean', 'Median'])

    # Histogram: values are on the x-axis, so the markers must be vertical.
    # (Horizontal lines here would be drawn on the frequency axis.)
    values.plot(kind="hist", figsize=(13, 8))
    plt.title(col_name, size=18)
    _mark_mean_and_median(plt.axvline)
    plt.show()

    # Values against the row index: values are on the y-axis.
    values.plot(figsize=(13, 8))
    plt.title(col_name, size=18)
    _mark_mean_and_median(plt.axhline)
    plt.show()

    # Sorted values against their rank.
    values.sort_values().reset_index(drop=True).plot(figsize=(13, 8))
    plt.title(col_name + " (SORTED)", size=18)
    _mark_mean_and_median(plt.axhline)
    plt.show()

    # Box plot.
    values.plot(kind="box", figsize=(13, 8))
    plt.title(col_name, size=18)
    plt.xlabel("")
    plt.show()
def plot_date_columns(col_name):
    """Plot a datetime column of ``df``: raw, sorted, and year/month/day shares.

    Returns ``None`` without drawing when ``plot_______`` is falsy.
    """
    if not plot_______:
        return None

    dates = df[col_name]

    # Two line plots: dates in row order, then dates sorted ascending.
    line_plots = (
        (dates, "Index", "Date"),
        (dates.sort_values().reset_index(drop=True), "Index (sorted)", "Year"),
    )
    for series, x_label, y_label in line_plots:
        series.plot(figsize=(15, 7), grid=True)
        plt.xlabel(x_label, size=14)
        plt.ylabel(y_label, size=14)
        plt.title(col_name + " Graph", size=18)
        plt.show()

    # Three bar charts: share of rows (in percent) per year, month and day.
    frequency_plots = (
        (dates.dt.year, "Year", " year Frequency Graph"),
        (dates.dt.month, "Month", " month Frequency Graph"),
        (dates.dt.day, "Day", " Day Frequency Graph"),
    )
    for component, x_label, title_suffix in frequency_plots:
        share = component.value_counts().sort_index() / len(df) * 100
        share.plot(kind="bar", figsize=(15, 7), grid=True)
        plt.xlabel(x_label, size=14)
        plt.ylabel("Ratio (1-100)", size=14)
        plt.title(col_name + title_suffix, size=18)
        plt.show()
def plot_catagorical_columns(cat_variable):
    """Bar-plot the percentage of rows of ``df`` in each category of a column.

    Returns ``None`` without drawing when ``plot_______`` is falsy.
    """
    if not plot_______:
        return None

    category_share = df[cat_variable].value_counts() / len(df) * 100
    category_share.plot.bar(figsize=(15, 6), grid=True)
    plt.title(cat_variable, size=18, color='r')
    for set_label, text in ((plt.xlabel, "Catagory"), (plt.ylabel, "Ratio (1-100)")):
        set_label(text, size=14, color='r')
    plt.show()
def data_shape():
    """Return a short multi-line description of the row/column count of ``df``."""
    n_rows, n_columns = df.shape
    return "The Data have:\n\t" + str(n_rows) + " rows\n\t" + str(n_columns) + " columns\n"
#===
# df = pd.read_csv("data.csv", date_parser=True)
# df = pd.read_csv("df_only_selected_columns_using_PCA.csv", date_parser=True)
# target_variable = "ACTUAL_WORTH"
# df = pd.concat([
# df.select_dtypes("number").iloc[:, :3],
# df.select_dtypes("O").iloc[:, :3],
# df.select_dtypes(exclude=["number", "O"]),
# df[[target_variable]]], 1)
# target_variable = "AREA_NAME_EN"
# df = pd.read_csv("cleaned_data.csv", date_parser=True)
# target_variable = "SalePrice"
# Load both files and stack them into one frame. Rows that come from the
# test file get a missing target.
target_variable = "SalePrice"
train = pd.read_csv("/home/amir/Downloads/train.csv")
test = pd.read_csv("/home/amir/Downloads/test.csv")

# Detach the target so that both frames share the same columns when stacked.
train_y = train.pop(target_variable)
df = pd.concat([train, test])
df[target_variable] = [*train_y.to_list(), *([None] * len(test))]
#===
# Report the size of the data.
new_line()
print(data_shape())

# Report (and plot) how many columns there are of each dtype.
new_line()
dtype_counts = df.dtypes.value_counts()
print(f"Columns types distribution:\n\n{dtype_counts}\n")
dtype_counts.plot(kind='barh', figsize=(10, 2), grid=True, title="Variable types Count Graph")
plt.xlabel("Count")
plt.show()
#===
# Rows without a target value cannot be used; report them and drop them.
n_missing_target = df[target_variable].isna().sum()
if n_missing_target:
    new_line()
    print(colored(f"There are {n_missing_target} NAs in target values, we droped those rows", 'red'))
    has_target = df[target_variable].notna()
    df = df[has_target]
    del has_target
del n_missing_target
#---------------------------------------------------
# df.select_dtypes("O").columns[:5]
# D = df.select_dtypes(exclude="O")
# D2 = df.select_dtypes("O").iloc[:,:5]
# df = pd.concat([D, D2], 1)
# profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)
# profile.to_file("your_report.html")
#---------------------------------------- NA
# ``a``: number of missing values per column, restricted to columns that
# have at least one (kept under this name because later steps reuse it).
a = df.isna().sum().where(lambda counts: counts > 0).dropna()
if a.size:
    new_line()
    share_of_columns = round(len(a) / df.shape[1] * 100)
    print(colored(
        f"There are {len(a)} (out of {df.shape[1]}, [{share_of_columns}%]) columns that contains 1 or more NA.",
        'red',
    ))

    # One "<column>_NA_indicator" feature per column that has gaps.
    indicator_labels = {True: "Missing", False: "Not missing"}
    for column_with_na in a.index:
        df[column_with_na + "_NA_indicator"] = df[column_with_na].isna().map(indicator_labels)
    new_line()
    print(colored(f"{a.size} NA_indicator variables added to the data\n", 'red'))

    print("========= NA Graphs =========\n")
    msno.matrix(df)
    plt.title("NA Graph")
    plt.show()
    new_line()
    sns.heatmap(df.isnull(), cbar=False)
    plt.title("NA Graph")
    plt.show()
#===
# Turn the NA counts into percentages of rows, ascending.
a = a.sort_values() / len(df) * 100

# Columns that are 100% missing carry no information: drop and list them.
fully_missing = a[a == 100].index
if len(fully_missing):
    new_line()
    df.drop(columns=fully_missing, inplace=True)
    print(colored(
        f"There are {len(fully_missing)} columns that are all Missing values, so we droped those.\nNow {data_shape()}\n\nDropped columns names:",
        'red',
    ))
    for dropped_name in fully_missing:
        print("\t", dropped_name)
    a = a[a != 100]

# dtype distribution of the columns that still contain missing values.
na_dtype_counts = df[a.index].dtypes.value_counts()
if na_dtype_counts.size:
    new_line()
    print(f"NA columns data type Distribution:\n\n{na_dtype_counts}")
del na_dtype_counts

# Percentage of missing values per remaining column.
new_line()
print(f"NaN Ratio (0-100)\n\n{a}" if a.size else colored("Now There is no NaN value in our Data", 'red'))
#===
# ----------------------------------------------- Imputing Missing values
# ------------------------------------ Numerical columns imputing
def _missing_values_breakdown():
    # Total count of missing cells in df, split by categorical / numeric / other.
    return (
        f'There are {df.isna().sum().sum()} Missing values:'
        f'\n\t{df.select_dtypes("O").isna().sum().sum()} in catagorical variables'
        f'\n\t{df.select_dtypes("number").isna().sum().sum()} in numerical columns'
        f'\n\t{df.select_dtypes(exclude=["O", "number"]).isna().sum().sum()} in others'
    )


# Fill the gaps of the numeric columns with a 4-nearest-neighbours imputer.
if df.select_dtypes("number").isna().sum().sum():
    new_line()
    print(f'(Before Missing values treatment)\n{_missing_values_breakdown()}')

    from sklearn.impute import KNNImputer

    non_numeric_part = df.select_dtypes(exclude="number").reset_index(drop=True)
    numeric_part = df.select_dtypes("number")
    del df  # release the full frame before the imputer builds its own arrays

    filled_values = KNNImputer(n_neighbors=4, weights="uniform").fit_transform(numeric_part)
    numeric_part = pd.DataFrame(filled_values, columns=numeric_part.columns)

    # Both parts now carry a fresh 0..n-1 index, so they line up row by row.
    df = pd.concat([non_numeric_part, numeric_part], axis=1)
    del non_numeric_part, numeric_part, filled_values

    print(f'\n(After filling numeric missing values)\n{_missing_values_breakdown()}')
#===
# -------------------------------- Catagoriacal variables imputating
# Object columns that contain missing values, least affected first, so that
# each column filled here can serve as a predictor for the following ones.
vars_to_fill = df.select_dtypes("O").isna().mean().where(lambda x: x > 0).dropna().sort_values(ascending=True)
if vars_to_fill.size:
    for col in vars_to_fill.index:
        # Predictors: every column that currently has no missing value.
        complete_columns = df.loc[:, df.isna().sum() == 0]
        numeric_features = complete_columns.select_dtypes("number")
        dummy_features = pd.get_dummies(complete_columns.select_dtypes(exclude="number"), prefix_sep="__")
        # ``axis`` is passed by keyword: pandas 2.0 removed the positional form.
        features = pd.concat([numeric_features, dummy_features], axis=1)

        # Fit on the rows where the column is known, predict the others.
        # Scratch names are used on purpose so the loaded ``train``/``test``
        # frames are not overwritten.
        is_known = df[col].notna()
        imputation_tree = DecisionTreeClassifier().fit(features[is_known], df.loc[is_known, col])
        df.loc[~is_known, col] = imputation_tree.predict(features[~is_known])
    new_line()
    print(f"Missing values imputed, Now there are {df.isna().sum().sum()} Missing values")
# ----------------------------------------------- END Imputing Missing values
# --------------------------------------------------------- Unique values
# Columns holding a single distinct value cannot explain anything: drop them.
only_one_unique_value = df.nunique().loc[lambda counts: counts == 1]
if only_one_unique_value.size:
    new_line()
    df.drop(columns=only_one_unique_value.index, inplace=True)
    if only_one_unique_value.size == 1:
        plural, pronoun = "", "it"
    else:
        plural, pronoun = "s", "those"
    print(colored(
        f"There are {only_one_unique_value.size} variable{plural} That have only one unique value, so we droped {pronoun}.\nDropped column{plural} name{plural} (in order):",
        'red',
    ))
    for dropped_name in only_one_unique_value.index.sort_values():
        print(dropped_name)
    new_line()
    print(f"\nNow {data_shape()}")
del only_one_unique_value
# #===
# Columns in which no value repeats (identifier-like columns) are dropped.
# The target is never a candidate: a continuous target can legitimately have
# no repeated value, and dropping it would break the modelling stage.
all_values_are_unique = pd.Series(
    {name: df[name].is_unique for name in df.columns if name != target_variable},
    dtype=bool,
)
all_values_are_unique = all_values_are_unique[all_values_are_unique]
if all_values_are_unique.size:
    new_line()
    df.drop(columns=all_values_are_unique.index, inplace=True)
    last_ = ("", "it") if all_values_are_unique.size == 1 else ("s", "those")
    to_print = f"There are {all_values_are_unique.size} column{last_[0]} that have all unique values, so no value repeatation, we droped {last_[1]} column{last_[0]}.\nDropped column{last_[0]} name{last_[0]} are:\n"
    print(colored(to_print, 'red'))
    for i in all_values_are_unique.index:
        print("\t", i)
    new_line()
    print(f"Now {data_shape()}")
del all_values_are_unique
#===
date_columns = []


def DTYPES():
    """Classify the columns of the global ``df`` as Object, Number or Date.

    Every object column is tentatively parsed with ``pd.to_datetime``. When
    parsing succeeds the column is converted in ``df`` in place and counted
    as a date column. The global ``date_columns`` is rebound to the names of
    those columns (a ``pd.Index`` when there are any, otherwise an empty
    list).

    A warning is printed when a column ends up in more than one group or in
    none of them.

    Returns
    -------
    pandas.DataFrame
        Columns ``Column`` and ``dtype``; ``dtype`` is one of ``'Object'``,
        ``'Number'``, ``'Date'``. Rows are ordered object columns first,
        then numeric, then date.
    """
    global date_columns
    sample = df.head()
    catagorical_columns = sample.select_dtypes("O").columns
    numerical_columns = sample.select_dtypes("number").columns

    date_columns = []
    for name in catagorical_columns:
        try:
            df[name] = pd.to_datetime(df[name])
        except (ValueError, TypeError, OverflowError):
            # Not parseable as dates: the column stays categorical.
            continue
        date_columns.append(name)
    catagorical_columns = catagorical_columns.drop(date_columns)
    if date_columns:
        date_columns = pd.Index(date_columns)

    # One table serves both the return value and the duplicate warning, so
    # the type labels cannot drift apart.
    typed_columns = catagorical_columns.append(numerical_columns).append(date_columns)
    dtypes = pd.DataFrame({
        "Column": typed_columns,
        "dtype": ['Object'] * len(catagorical_columns)
                 + ['Number'] * len(numerical_columns)
                 + ['Date'] * len(date_columns),
    })

    if not typed_columns.is_unique:
        new_line()
        print(colored("Some column/s repated in > 1 dtypes\n", 'red'))
        print(dtypes[dtypes.Column.duplicated(keep=False)].to_string())

    unclassified = df.columns.difference(typed_columns)
    if unclassified.size:
        new_line()
        print(colored("Some columns not included in any existing catagory, those:\n", 'red'))
        for name in unclassified:
            print(f"\t<{name}, with dtype of <{df[name].dtype}>")

    return dtypes
#===
# Table with one row per column of df and its type label (Object / Number /
# Date). DTYPES() also converts parseable object columns to datetime in place.
dtypes = DTYPES()
# ----------------------------------------------------------------------- Feature enginearing
# ======= Adding date columns
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> add polynomial, sqrt, tree, log features
def add_new_date_cols(x, suffix):
    """Derive calendar features from a datetime column and add them to ``df``.

    Parameters
    ----------
    x : pandas.Series
        Datetime column taken from the global ``df``.
    suffix : str
        Text placed in front of each new column name (e.g. the name of the
        source column).

    Returns
    -------
    pandas.DataFrame
        ``df`` without the source column ``x.name``. Candidate features with
        at most one distinct value are not added. The new columns are
        written into the global ``df`` itself.
    """
    def _quoted_whole_numbers(values):
        # Render whole numbers as '"<n>"' strings; missing entries stay NaN.
        return '"' + values.apply(lambda v: np.nan if pd.isna(v) else str(int(v))) + '"'

    # ``Series.dt.week`` was removed in pandas 2.0. isocalendar() gives the
    # same ISO week number; cast to float so NaT becomes NaN.
    iso_week = x.dt.isocalendar().week.astype("float64")

    d = {}
    d[suffix + '_week_normalized'] = iso_week / 52
    d[suffix + '_week_str'] = _quoted_whole_numbers(iso_week)
    d[suffix + '_year_after_min_year'] = x.dt.year - x.dt.year.min()
    d[suffix + '_year_str'] = _quoted_whole_numbers(x.dt.year)
    d[suffix + '_day_name'] = x.dt.day_name()
    # Whole days elapsed since the earliest date, as a quoted string.
    d[suffix + '_day_after_min_date_str'] = '"' + (x - x.min()).apply(lambda delta: str(delta).split()[0]) + '"'
    d[suffix + '_day_normalized'] = x.dt.day / 31
    d[suffix + '_hour_normalized'] = x.dt.hour / 24
    d[suffix + '_hour_str'] = _quoted_whole_numbers(x.dt.hour)
    d[suffix + '_month_name'] = x.dt.month_name()
    d[suffix + '_month_normalized'] = x.dt.month / 12

    for name, feature in d.items():
        if feature.nunique() > 1:
            df[name] = feature
    return df.drop(columns=x.name)
# return df
# Replace every date column by its derived calendar features and report how
# many columns that added in total.
n_columns_before_dates = df.shape[1]
for date_col in date_columns:
    df = add_new_date_cols(df[date_col], date_col)
n_date_features_added = df.shape[1] - n_columns_before_dates
if n_date_features_added > 0:
    new_line()
    print(colored(f"Added {n_date_features_added} date Features", 'red'))
# ======= type casting of numerical variable (those who have < 4% unique values) to catagorical variables
# Numeric columns whose distinct values amount to less than 4% of the rows
# also get a quoted-string copy named "<column>_str".
unique_share = df.select_dtypes("number").nunique() / len(df) * 100
f = unique_share[unique_share < 4].index
if f.size:
    n_columns_before_strings = df.shape[1]
    for col_num_to_str in f:
        df[col_num_to_str + "_str"] = '"' + df[col_num_to_str].astype(str) + '"'
    n_string_features_added = df.shape[1] - n_columns_before_strings
    new_line()
    print(colored(
        f"Added {n_string_features_added} String Features (Extracted from numerical variables)",
        'red',
    ))
# =======
# Lump the rare categories of every object column. The helper returns None
# when it dropped the column itself, in which case nothing is written back.
for var in df.select_dtypes("O").columns:
    lumped = cluping_rare_cases_in_one_catagory(var)
    if isinstance(lumped, pd.Series):
        df[var] = lumped

# Summary: how many rows per column ended up in the "Rare cases" bucket.
new_line()
rare_counts = (df == 'Rare cases').sum().sort_values()
xx = rare_counts.where(rare_counts > 0).dropna()
xx = pd.DataFrame({
    "Count": xx,
    "Ratio": (xx / len(df) * 100).round(4),
})
print(f"<Rare case> catagory:\n{xx.to_string()}")
# ----------------------------------------------------------------------- END (Feature enginearing)
# Feature engineering changed the set of columns: classify them again.
dtypes = DTYPES()

# Heatmap of the absolute pairwise correlations between numeric columns.
new_line()
cor_df = df.select_dtypes('number').corr().abs()
# Mask the upper triangle (diagonal included) so each pair shows only once.
mask = np.triu(np.ones(cor_df.shape, dtype=bool))
fig, ax = plt.subplots(figsize=(17, 10))
cmap = sns.color_palette("viridis", as_cmap=True)
plot_ = sns.heatmap(
    cor_df,
    mask=mask,
    cmap=cmap,
    vmax=.3,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
plot_.set_title("abs (Correlation) plot", fontsize=25)
plt.show()
# ---------------------------------------------------------------------
#===
# m = 0
def _report_numeric_column(x, column_name):
    """Print correlation, skewness and summary statistics of a numeric column, then plot it."""
    # Absolute correlations with the other numeric columns, ascending.
    local_cor = cor_df[column_name].drop(column_name).reset_index()
    local_cor = local_cor.reindex(local_cor[column_name].abs().sort_values().index)
    if local_cor[column_name].max() == 1:
        new_line()
        to_print = f"This column is perfactly correlated with column <{local_cor[local_cor[column_name] == 1]['index'].values[0]}, so remove one of them"
        print(colored(to_print, 'red'))

    new_line()
    xm = local_cor[-3:].rename(columns={'index': 'Column name', column_name: 'Correlation'}).reset_index(drop=True)
    xm.index = xm['Column name']
    xm.drop(columns="Column name", inplace=True)
    xm.plot(kind='barh', grid=True, figsize=(10, 1.5))
    plt.title("Most 3 correlated features with this columns (sorted)", size=14)
    plt.xlabel("Correlation", size=12)
    plt.show()

    new_line()
    skewness = x.skew(skipna=True)
    if abs(skewness) < 0.5:
        print(f"The data is fairly symmetrical (skewness is: {skewness})")
    elif abs(skewness) < 1:
        print(f"The data are moderately skewed (skewness is: {skewness})")
    else:
        to_print = f"The data are highly skewed (skewness is: {skewness})\nNote: When skewness exceed |1| we called it highly skewed"
        print(colored(to_print, 'red'))

    # "Outlies" counts values more than 3 standard deviations from the mean.
    ff = [x.count(), x.isna().sum(), x.mean(), x.std(), x.min()]
    ff += x.quantile([.25, .5, .75]).to_list()
    ff += [x.max(), x.nunique(), (((x - x.mean()) / x.std()).abs() > 3).sum(), (x < 0).sum(), (x == 0).sum()]
    f = pd.DataFrame(ff, index=['Count', 'NA', 'Mean', 'Std', 'Min', '25%', '50%', '75%', 'Max', 'Nunique', 'Outlies', 'Nagetive', 'Zeros'], columns=['Count'])
    f['Ratio'] = f.Count / x.count() * 100
    # A percentage of the row count is meaningless for the value statistics.
    f.loc['Mean': 'Max', 'Ratio'] = None
    new_line()
    print(f.round(2).to_string())
    plot_numerical_columns(column_name)


def _report_object_column(x, column_name):
    """Print frequency statistics and case/whitespace warnings of a categorical column, then plot it."""
    value_counts = x.value_counts()
    l = (
        x.count(),
        x.nunique(),
        len(x),
        x.isna().sum(),
        (x == x.mode().values[0]).sum(),
        (x == value_counts.tail(1).index[0]).sum(),
        value_counts.where(lambda counts: counts == 1).dropna().size,
    )
    f = pd.DataFrame(l, index=['Count', 'Nunique', 'Len', 'NA', 'Most frequent', 'Least frequent', 'Values occured only once'], columns=['Counts'])
    f['Ratio'] = (f.Counts / x.count() * 100).round(4)
    f.loc[['Len'], 'Ratio'] = None
    new_line()
    print(f.to_string())

    # Categories that differ only by letter case or surrounding whitespace.
    if x.str.lower().nunique() != x.nunique():
        new_line()
        to_print = f"Case issue\n\tin orignal variable There are {x.nunique()} unique values\n\tin lower verstion there are {x.str.lower().nunique()} unique values.\n"
        print(colored(to_print, 'red'))
    if x.str.strip().nunique() != x.nunique():
        new_line()
        to_print = f"Space issue\n\tin orignal variable There are {x.nunique()} unique values\n\tin striped verstion there are {x.str.strip().nunique()} unique values."
        print(colored(to_print, 'red'))
    plot_catagorical_columns(column_name)


def _report_date_column(x, column_name):
    """Print span, frequency statistics and calendar gaps of a date column, then plot it."""
    new_line()
    rd = relativedelta.relativedelta(pd.to_datetime(x.max()), pd.to_datetime(x.min()))
    to_print = f"Diffrenece between first and last date:\n\tYears : {rd.years}\n\tMonths: {rd.months}\n\tDays : {rd.days}"
    print(colored(to_print, 'red'))

    value_counts = x.value_counts()
    ff = (
        x.count(),
        x.nunique(),
        (x == x.mode().values[0]).sum(),
        (x == value_counts.tail(1).index[0]).sum(),
        value_counts.where(lambda counts: counts == 1).dropna().size,
    )
    f = pd.DataFrame(ff, index=["Count", 'Nunique', 'Most frequent values', 'Least frequent values', 'Values occured only once count'], columns=['Counts'])
    f['Ratio'] = (f.Counts / x.count() * 100).round(4)
    new_line()
    print(f"\n{f.to_string()}")

    # Years / months / days inside the observed range that never occur.
    parts = (
        (x.dt.year, "These Years (in order) are missing:\n"),
        (x.dt.month, "These Months (in order) are missing:\n"),
        (x.dt.day, "These Days (in order) are missing:\n"),
    )
    for part, heading in parts:
        missing = set(np.arange(part.min(), part.max() + 1)).difference(part.unique())
        if missing:
            new_line()
            print(colored(heading, 'red'))
            # sorted(): the heading promises order, a set does not give one.
            for value in sorted(missing):
                print("\t", value, end=", ")
    new_line()
    plot_date_columns(column_name)


# Per-column report. Type labels are the ones produced by DTYPES():
# "Object", "Number" and "Date".
for _, (column_name, type_) in dtypes.iterrows():
    x = df[column_name]
    to_print = f"\n\n\n========================================= {column_name} =========================================\n\n"
    print(colored(to_print, 'red'))

    # Another column is a duplicate when it has the same number of distinct
    # values and the two map one-to-one onto each other.
    n_unique = x.nunique()
    for col_ in df.columns:
        if col_ == column_name or df[col_].nunique() != n_unique:
            continue
        unique_combination = df[[col_, column_name]].drop_duplicates()
        if unique_combination.apply(lambda s: s.is_unique).sum() == 2:
            new_line()
            to_print = f"This Columns is duplicate of <{col_}> column"
            print(colored(to_print, 'red'))

    print(f"Column Type : ", end="")
    print(colored(type_, 'red'))

    if x.isna().all():
        new_line()
        df.drop(columns=column_name, inplace=True)
        print(colored("We dropped This column, because it is all Empty", 'red'))
        continue
    # The label must be "Object": with "O" this check never matched.
    if type_ in ["Object", "Date"] and x.is_unique:
        new_line()
        df.drop(columns=column_name, inplace=True)
        to_print = f"We dropped This column, because it's a {type_} columns, and it's all values are unique"
        print(colored(to_print, 'red'))
        continue
    if x.nunique() == 1:
        new_line()
        df.drop(columns=column_name, inplace=True)
        print(colored("We dropped This column, because There is only one unique value", 'red'))
        continue

    if type_ == "Number":
        _report_numeric_column(x, column_name)
    elif type_ == "Object":
        _report_object_column(x, column_name)
    elif type_ == "Date":
        # Must compare ``type_``: the builtin ``type`` never equals "Date".
        _report_date_column(x, column_name)
# ================================================================================================================ Modeling
print("\n\n")
print("----------------------------------------------------------------------------------------------")
print("****************************************** Modeling ******************************************")

# Any numeric target (of any width, booleans excluded) is a regression
# problem; an object target is a classification problem.
target_dtype = df[target_variable].dtype
target_is_numeric = (
    pd.api.types.is_numeric_dtype(target_dtype)
    and not pd.api.types.is_bool_dtype(target_dtype)
)

if target_is_numeric:
    print("\n-------------------- This is Regression problem --------------------\n")
    print("''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''")

    # Design matrix: numeric columns plus one-hot encoded categoricals. The
    # dummies are created as floats so the matrix stays purely numeric
    # (OLS cannot fit a mixed bool/float frame).
    numeric_part = df.select_dtypes("number")
    dummy_part = pd.get_dummies(df.select_dtypes(exclude="number"), prefix_sep="__", dtype=float)
    df = pd.concat([numeric_part, dummy_part], axis=1)
    del numeric_part, dummy_part

    train_X, test_X, train_y, test_y = train_test_split(df.drop(columns=target_variable), df[target_variable])

    # --------------------------------------------------------- Linear regression
    to_print = "\n ------------------------------------- Linear Regression -------------------------------------\n"
    print(colored(to_print, 'red'))
    model_reg = OLS(train_y, train_X).fit()

    # Turn the coefficient table of the summary into a numeric DataFrame.
    summary = model_reg.summary()
    summary_df = pd.DataFrame(summary.tables[1])
    summary_df.columns = summary_df.iloc[0]
    summary_df.drop(0, inplace=True)
    summary_df.columns = summary_df.columns.astype(str)
    summary_df.columns = ["Variable"] + summary_df.columns[1:].to_list()
    for i in summary_df.columns[1:]:
        summary_df[i] = summary_df[i].astype(str).astype(float)
    summary_df.Variable = summary_df.Variable.astype(str)
    # Significance stars by p-value.
    summary_df['Indicator'] = summary_df['P>|t|'].apply(lambda x: "***" if x < 0.001 else "**" if x < 0.01 else "*" if x < 0.05 else "." if x < 0.1 else "")
    summary_df = summary_df.sort_values("Variable").reset_index(drop=True)
    # Write the file that the note below announces. Without a path,
    # to_csv() only returns a string and saves nothing.
    summary_df.to_csv("summary_OLS_1.csv", index=False)
    new_line()
    print(colored("NOTE: This summary saved as <summary_OLS_1.csv>", 'red'))
    new_line()
    print(summary_df.to_string())

    # ============================= Model statistic
    predictions = model_reg.predict(test_X)
    new_line()
    print(colored(" --- Model statistic --- \n", 'red'))
    print(f"R-squared : {round(model_reg.rsquared, 3)}")
    print(f"Adj. R-squared : {round(model_reg.rsquared_adj, 3)}")
    print(f"F-statistic : {round(model_reg.fvalue)}")
    print(f"Prob (F-statistic): {model_reg.f_pvalue}")
    print(f"No. Observations : {round(model_reg.nobs)}")
    print(f"AIC : {round(model_reg.aic)}")
    print(f"Df Residuals : {round(model_reg.df_resid)}")
    print(f"BIC : {round(model_reg.bic)}")
    print(f"RMSE (test) : {RMSE(predictions)}")

    # Strongest absolute correlation between the training residuals and
    # any feature.
    residual_frame = train_X.copy(deep=True)
    residual_frame['Errors__'] = model_reg.resid
    top_residual_cor = residual_frame.corr()['Errors__'].drop("Errors__").abs().sort_values().dropna().tail(1)
    new_line()
    print(f"Maximum correlation between Reseduals and any data columns is {top_residual_cor.values[0]}, with columns <{top_residual_cor.index[0]}>")
    print(f"Mean of train reseduals: {model_reg.resid.mean()}")
    del residual_frame, top_residual_cor

    # --------------------------------------------------------- Random Forest
    print("\n ------------------------------------- Random Forest -------------------------------------\n")
    rf = RandomForestRegressor(n_estimators=200, oob_score=True)
    model_rf = rf.fit(train_X, train_y)
    predictions_rf = rf.predict(test_X)
    new_line()
    print(colored("RF model peramters:\n", 'red'))
    pprint.pprint(model_rf.get_params())
    new_line()

    # Plot at most the 30 most important features.
    featuresImportance = pd.Series(model_rf.feature_importances_, index=train_X.columns).sort_values(ascending=False)
    if len(featuresImportance) > 30:
        featuresImportance = featuresImportance.head(30)
    featuresImportance.plot(figsize=(20, 10), kind='bar', grid=True)
    plt.title("RandomForest Feature importances Graph", size=18, color='red')
    plt.xlabel("Features", size=14, color='red')
    plt.ylabel("Importance", size=14, color='red')
    plt.show()
    del featuresImportance

    new_line()
    print(colored("--- Model statistic ---", 'red'))
    # RandomForestRegressor.score() is the coefficient of determination R^2.
    print(f"R^2 (test) : {rf.score(test_X, test_y)}")
    print(f"R^2 (train): {rf.score(train_X, train_y)}")
    print(f"RMSE (test): {RMSE(predictions_rf)}")
    print(f"oob score : {model_rf.oob_score_}")

    residual_frame = test_X.copy(deep=True)
    errors_rf = predictions_rf - test_y
    residual_frame['Errors__'] = errors_rf
    top_residual_cor = residual_frame.corr()['Errors__'].drop("Errors__").abs().sort_values().dropna().tail(1)
    new_line()
    print(f"Maximum correlation between Reseduals and any data columns is {top_residual_cor.values[0]}, with columns <{top_residual_cor.index[0]}>")

elif target_dtype == "O":
    if df[target_variable].nunique() == 2:
        print("\n-------------------- This is Binary Classification problem --------------------\n")
        print("''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''")
        # ``axis`` is passed by keyword: pandas 2.0 removed the positional form.
        df = pd.concat([
            df.select_dtypes(exclude="O"),
            pd.get_dummies(df.drop(columns=target_variable).select_dtypes("O")),
            df[[target_variable]],
        ], axis=1)
        train_X, test_X, train_y, test_y = train_test_split(df.drop(columns=target_variable), df[target_variable])
        clf = LogisticRegression().fit(train_X, train_y)

        # Column 0 of predict_proba is the probability of clf.classes_[0].
        first_class, second_class = clf.classes_
        predictions = pd.Series(clf.predict_proba(test_X)[:, 0])
        test_y = test_y.reset_index(drop=True)

        # Accuracy / precision / recall / F1 over a sweep of thresholds,
        # with clf.classes_[1] treated as the positive class.
        lst = []
        for thresh in np.linspace(predictions.min(), predictions.max(), 50)[1:]:
            # Predict the first class when ITS probability reaches the
            # threshold. The comparison was inverted before, which assigned
            # the first class to the rows least likely to belong to it.
            pred = pd.Series(np.where(predictions >= thresh, first_class, second_class))
            TP = ((pred == second_class) & (test_y == second_class)).sum()
            FN = ((pred == first_class) & (test_y == second_class)).sum()
            FP = ((pred == second_class) & (test_y == first_class)).sum()
            p = TP / (TP + FP)
            r = TP / (TP + FN)
            f1 = 2 * ((p * r) / (p + r))
            lst.append((thresh, (pred == test_y).mean(), p, r, f1))

        d = pd.DataFrame(lst, columns=["Thresold", "Accurecy(0-1)", "Precision", "Recall", "F1"])
        d = d.set_index("Thresold")
        d.plot(grid=True, figsize=(18, 7))
        plt.title("Model performance at diffrent Thresolds", size=18, color='red')
        plt.xlabel("Thresold", size=14, color='red')
        plt.ylabel("")
        plt.show()
    else:
        to_print = "\n-------------------- This is Multiclass Classification problem --------------------\n"
        print(colored(to_print, 'red'))
        print("'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''")

        # Label-encode every object column (the target included). The
        # original row index is kept so the encoded values land on the rows
        # they came from.
        object_columns = df.select_dtypes("O").columns
        df[object_columns] = df[object_columns].apply(
            lambda column: pd.Series(
                LabelEncoder().fit_transform(column.astype(str)),
                index=column.index,
            ).astype(str)
        )
        train_X, test_X, train_y, test_y = train_test_split(df.drop(columns=target_variable), df[target_variable])
        clf = RandomForestClassifier(n_estimators=1000).fit(train_X, train_y)
        predictions = clf.predict(test_X)

        # Plot at most the 30 most important features.
        feature_imp = pd.Series(clf.feature_importances_, index=train_X.columns).sort_values(ascending=False)
        if feature_imp.size > 30:
            feature_imp = feature_imp.head(30)
        feature_imp.plot(kind='barh', figsize=(17, 10), grid=True)
        plt.title("Feature importances Graph", size=18, color='red')
        plt.xlabel("Importance", size=14, color='red')
        plt.ylabel("Feature", size=14, color='red')
        plt.show()

        print(f"accuracy_score: {metrics.accuracy_score(test_y, predictions)}")
        # The default average='binary' raises on a multiclass target; the
        # support-weighted mean of the per-class F1 scores is reported.
        print(f"f1_score: {metrics.f1_score(test_y, predictions, average='weighted')}")

        # ROC and precision-recall curves are defined for two classes only,
        # so one one-vs-rest curve is drawn per class.
        class_scores = clf.predict_proba(test_X)

        plt.figure()
        for position, label in enumerate(clf.classes_):
            fpr, tpr, _ = metrics.roc_curve(test_y == label, class_scores[:, position])
            plt.plot(fpr, tpr, label=f"class {label}")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.legend()
        plt.title("ROC curve plot")
        plt.show()

        # The display object has to be plotted explicitly to draw anything.
        metrics.ConfusionMatrixDisplay(
            metrics.confusion_matrix(test_y, predictions, labels=clf.classes_),
            display_labels=clf.classes_,
        ).plot()
        plt.title("Confusion matrix")
        plt.show()

        plt.figure()
        for position, label in enumerate(clf.classes_):
            precision, recall, _ = metrics.precision_recall_curve(test_y == label, class_scores[:, position])
            plt.plot(recall, precision, label=f"class {label}")
        plt.xlabel("Recall")
        plt.ylabel("Precision")
        plt.legend()
        plt.title("Precision recall curve")
        plt.show()
# ================================================================================================================ END Modeling
------------------------- The Data have: 2919 rows 81 columns ------------------------- Columns types distribution: object 43 int64 26 float64 12 dtype: int64
------------------------- There are 1459 NAs in target values, we droped those rows ------------------------- There are 19 (out of 81, [23%]) columns that contains 1 or more NA. ------------------------- 19 NA_indicator variables added to the data ========= NA Graphs =========
-------------------------
------------------------- NA columns data type Distribution: object 16 float64 3 dtype: int64 ------------------------- NaN Ratio (0-100) Electrical 0.068493 MasVnrType 0.547945 MasVnrArea 0.547945 BsmtQual 2.534247 BsmtCond 2.534247 BsmtFinType1 2.534247 BsmtExposure 2.602740 BsmtFinType2 2.602740 GarageCond 5.547945 GarageQual 5.547945 GarageFinish 5.547945 GarageType 5.547945 GarageYrBlt 5.547945 LotFrontage 17.739726 FireplaceQu 47.260274 Fence 80.753425 Alley 93.767123 MiscFeature 96.301370 PoolQC 99.520548 dtype: float64 ------------------------- (Before Missing values treatment) There are 6965 Missing values: 6617 in catagorical variables 348 in numerical columns 0.0 in others (After filling numeric missing values) There are 6617 Missing values: 6617 in catagorical variables 0 in numerical columns 0.0 in others ------------------------- Missing values imputed, Now there are 0 Missing values ------------------------- There are 1 column that have all unique values, so no value repeatation, we droped it column. 
Dropped column name are: Id ------------------------- Now The Data have: 1460 rows 99 columns ------------------------- Added 18 String Features (Extracted from numerical variables) ------------------------- The column <Street> have imbalanced, so we droped it, it has 2 unique values, and most commont value frequency ratio is 0.9958904109589041 ------------------------- The column <Utilities> have imbalanced, so we droped it, it has 2 unique values, and most commont value frequency ratio is 0.9993150684931507 ------------------------- The column <Electrical_NA_indicator> have imbalanced, so we droped it, it has 2 unique values, and most commont value frequency ratio is 0.9993150684931507 ------------------------- The column <PoolQC_NA_indicator> have imbalanced, so we droped it, it has 2 unique values, and most commont value frequency ratio is 0.9952054794520548 ------------------------- The column <PoolArea_str> have imbalanced, so we droped it, it has 8 unique values, and most commont value frequency ratio is 0.9952054794520548 ------------------------- <Rare case> catagory: Count Ratio HouseStyle 8.0 0.5479 MasVnrType_NA_indicator 8.0 0.5479 MasVnrArea_NA_indicator 8.0 0.5479 FullBath_str 9.0 0.6164 Foundation 9.0 0.6164 RoofStyle 9.0 0.6164 Neighborhood 11.0 0.7534 Heating 14.0 0.9589 BedroomAbvGr_str 14.0 0.9589 Condition1 15.0 1.0274 Condition2 15.0 1.0274 RoofMatl 15.0 1.0274 Exterior2nd 17.0 1.1644 3SsnPorch_str 24.0 1.6438 LowQualFinSF_str 26.0 1.7808 SaleType 28.0 1.9178 MiscVal_str 41.0 2.8082 -------------------------
========================================= MSZoning ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 1151 78.8356 Least frequent 10 0.6849 Values occured only once 0 0.0000
========================================= Alley ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 832 56.9863 Least frequent 628 43.0137 Values occured only once 0 0.0000
========================================= LotShape ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 925 63.3562 Least frequent 10 0.6849 Values occured only once 0 0.0000
========================================= LandContour ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 1311 89.7945 Least frequent 36 2.4658 Values occured only once 0 0.0000
========================================= LotConfig ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 1056 72.3288 Least frequent 47 3.2192 Values occured only once 0 0.0000
========================================= LandSlope ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1382 94.6575 Least frequent 13 0.8904 Values occured only once 0 0.0000
========================================= Neighborhood ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 24 1.6438 Len 1460 NaN NA 0 0.0000 Most frequent 225 15.4110 Least frequent 11 0.7534 Values occured only once 0 0.0000
========================================= Condition1 ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 7 0.4795 Len 1460 NaN NA 0 0.0000 Most frequent 1260 86.3014 Least frequent 11 0.7534 Values occured only once 0 0.0000
========================================= Condition2 ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1445 98.9726 Least frequent 15 1.0274 Values occured only once 0 0.0000
========================================= BldgType ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 1220 83.5616 Least frequent 31 2.1233 Values occured only once 0 0.0000
========================================= HouseStyle ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 8 0.5479 Len 1460 NaN NA 0 0.0000 Most frequent 726 49.7260 Least frequent 8 0.5479 Values occured only once 0 0.0000
========================================= RoofStyle ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 1141 78.1507 Least frequent 9 0.6164 Values occured only once 0 0.0000
========================================= RoofMatl ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1434 98.2192 Least frequent 11 0.7534 Values occured only once 0 0.0000
========================================= Exterior1st ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 10 0.6849 Len 1460 NaN NA 0 0.0000 Most frequent 522 35.7534 Least frequent 20 1.3699 Values occured only once 0 0.0000
========================================= Exterior2nd ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 12 0.8219 Len 1460 NaN NA 0 0.0000 Most frequent 504 34.5205 Least frequent 10 0.6849 Values occured only once 0 0.0000
========================================= MasVnrType ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 864 59.1781 Least frequent 16 1.0959 Values occured only once 0 0.0000
========================================= ExterQual ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 906 62.0548 Least frequent 14 0.9589 Values occured only once 0 0.0000
========================================= ExterCond ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1286 88.0822 Least frequent 28 1.9178 Values occured only once 0 0.0000
========================================= Foundation ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 647 44.3151 Least frequent 9 0.6164 Values occured only once 0 0.0000
========================================= BsmtQual ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 663 45.4110 Least frequent 53 3.6301 Values occured only once 0 0.0000
========================================= BsmtCond ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1336 91.5068 Least frequent 58 3.9726 Values occured only once 0 0.0000
========================================= BsmtExposure ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 955 65.4110 Least frequent 116 7.9452 Values occured only once 0 0.0000
========================================= BsmtFinType1 ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 6 0.4110 Len 1460 NaN NA 0 0.0000 Most frequent 467 31.9863 Least frequent 74 5.0685 Values occured only once 0 0.0000
========================================= BsmtFinType2 ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 6 0.4110 Len 1460 NaN NA 0 0.0000 Most frequent 1293 88.5616 Least frequent 14 0.9589 Values occured only once 0 0.0000
========================================= Heating ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1428 97.8082 Least frequent 14 0.9589 Values occured only once 0 0.0000
========================================= HeatingQC ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 742 50.8219 Least frequent 49 3.3562 Values occured only once 0 0.0000
========================================= CentralAir ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1365 93.4932 Least frequent 95 6.5068 Values occured only once 0 0.0000
========================================= Electrical ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1339 91.7123 Least frequent 27 1.8493 Values occured only once 0 0.0000
========================================= KitchenQual ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 735 50.3425 Least frequent 39 2.6712 Values occured only once 0 0.0000
========================================= Functional ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 1366 93.5616 Least frequent 14 0.9589 Values occured only once 0 0.0000
========================================= FireplaceQu ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 701 48.0137 Least frequent 34 2.3288 Values occured only once 0 0.0000
========================================= GarageType ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 903 61.8493 Least frequent 11 0.7534 Values occured only once 0 0.0000
========================================= GarageFinish ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 676 46.3014 Least frequent 356 24.3836 Values occured only once 0 0.0000
========================================= GarageQual ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1329 91.0274 Least frequent 15 1.0274 Values occured only once 0 0.0000
========================================= GarageCond ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 1366 93.5616 Least frequent 10 0.6849 Values occured only once 0 0.0000
========================================= PavedDrive ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1340 91.7808 Least frequent 30 2.0548 Values occured only once 0 0.0000
========================================= PoolQC ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 865 59.2466 Least frequent 159 10.8904 Values occured only once 0 0.0000
========================================= Fence ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 688 47.1233 Least frequent 39 2.6712 Values occured only once 0 0.0000
========================================= MiscFeature ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1371 93.9041 Least frequent 10 0.6849 Values occured only once 0 0.0000
========================================= SaleType ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 1267 86.7808 Least frequent 28 1.9178 Values occured only once 0 0.0000
========================================= SaleCondition ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 1202 82.3288 Least frequent 12 0.8219 Values occured only once 0 0.0000
========================================= LotFrontage_NA_indicator ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1201 82.2603 Least frequent 259 17.7397 Values occured only once 0 0.0000
========================================= Alley_NA_indicator ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1369 93.7671 Least frequent 91 6.2329 Values occured only once 0 0.0000
========================================= MasVnrType_NA_indicator ========================================= ------------------------- This Columns is duplicate of <MasVnrArea_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1452 99.4521 Least frequent 8 0.5479 Values occured only once 0 0.0000
========================================= MasVnrArea_NA_indicator ========================================= ------------------------- This Columns is duplicate of <MasVnrType_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1452 99.4521 Least frequent 8 0.5479 Values occured only once 0 0.0000
========================================= BsmtQual_NA_indicator ========================================= ------------------------- This Columns is duplicate of <BsmtCond_NA_indicator> column ------------------------- This Columns is duplicate of <BsmtFinType1_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1423 97.4658 Least frequent 37 2.5342 Values occured only once 0 0.0000
========================================= BsmtCond_NA_indicator ========================================= ------------------------- This Columns is duplicate of <BsmtQual_NA_indicator> column ------------------------- This Columns is duplicate of <BsmtFinType1_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1423 97.4658 Least frequent 37 2.5342 Values occured only once 0 0.0000
========================================= BsmtExposure_NA_indicator ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1422 97.3973 Least frequent 38 2.6027 Values occured only once 0 0.0000
========================================= BsmtFinType1_NA_indicator ========================================= ------------------------- This Columns is duplicate of <BsmtQual_NA_indicator> column ------------------------- This Columns is duplicate of <BsmtCond_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1423 97.4658 Least frequent 37 2.5342 Values occured only once 0 0.0000
========================================= BsmtFinType2_NA_indicator ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1422 97.3973 Least frequent 38 2.6027 Values occured only once 0 0.0000
========================================= FireplaceQu_NA_indicator ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 770 52.7397 Least frequent 690 47.2603 Values occured only once 0 0.0000
========================================= GarageType_NA_indicator ========================================= ------------------------- This Columns is duplicate of <GarageYrBlt_NA_indicator> column ------------------------- This Columns is duplicate of <GarageFinish_NA_indicator> column ------------------------- This Columns is duplicate of <GarageQual_NA_indicator> column ------------------------- This Columns is duplicate of <GarageCond_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1379 94.4521 Least frequent 81 5.5479 Values occured only once 0 0.0000
========================================= GarageYrBlt_NA_indicator ========================================= ------------------------- This Columns is duplicate of <GarageType_NA_indicator> column ------------------------- This Columns is duplicate of <GarageFinish_NA_indicator> column ------------------------- This Columns is duplicate of <GarageQual_NA_indicator> column ------------------------- This Columns is duplicate of <GarageCond_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1379 94.4521 Least frequent 81 5.5479 Values occured only once 0 0.0000
========================================= GarageFinish_NA_indicator ========================================= ------------------------- This Columns is duplicate of <GarageType_NA_indicator> column ------------------------- This Columns is duplicate of <GarageYrBlt_NA_indicator> column ------------------------- This Columns is duplicate of <GarageQual_NA_indicator> column ------------------------- This Columns is duplicate of <GarageCond_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1379 94.4521 Least frequent 81 5.5479 Values occured only once 0 0.0000
========================================= GarageQual_NA_indicator ========================================= ------------------------- This Columns is duplicate of <GarageType_NA_indicator> column ------------------------- This Columns is duplicate of <GarageYrBlt_NA_indicator> column ------------------------- This Columns is duplicate of <GarageFinish_NA_indicator> column ------------------------- This Columns is duplicate of <GarageCond_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1379 94.4521 Least frequent 81 5.5479 Values occured only once 0 0.0000
========================================= GarageCond_NA_indicator ========================================= ------------------------- This Columns is duplicate of <GarageType_NA_indicator> column ------------------------- This Columns is duplicate of <GarageYrBlt_NA_indicator> column ------------------------- This Columns is duplicate of <GarageFinish_NA_indicator> column ------------------------- This Columns is duplicate of <GarageQual_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1379 94.4521 Least frequent 81 5.5479 Values occured only once 0 0.0000
========================================= Fence_NA_indicator ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1179 80.7534 Least frequent 281 19.2466 Values occured only once 0 0.0000
========================================= MiscFeature_NA_indicator ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1406 96.3014 Least frequent 54 3.6986 Values occured only once 0 0.0000
========================================= MSSubClass_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 14 0.9589 Len 1460 NaN NA 0 0.0000 Most frequent 540 36.9863 Least frequent 10 0.6849 Values occured only once 0 0.0000
========================================= OverallQual_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 8 0.5479 Len 1460 NaN NA 0 0.0000 Most frequent 402 27.5342 Least frequent 18 1.2329 Values occured only once 0 0.0000
========================================= OverallCond_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 7 0.4795 Len 1460 NaN NA 0 0.0000 Most frequent 827 56.6438 Least frequent 22 1.5068 Values occured only once 0 0.0000
========================================= LowQualFinSF_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1434 98.2192 Least frequent 26 1.7808 Values occured only once 0 0.0000
========================================= BsmtFullBath_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 857 58.6986 Least frequent 15 1.0274 Values occured only once 0 0.0000
========================================= BsmtHalfBath_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1380 94.5205 Least frequent 80 5.4795 Values occured only once 0 0.0000
========================================= FullBath_str ========================================= ------------------------- This Columns is duplicate of <FullBath> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 768 52.6027 Least frequent 9 0.6164 Values occured only once 0 0.0000
========================================= HalfBath_str ========================================= ------------------------- This Columns is duplicate of <HalfBath> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 913 62.5342 Least frequent 12 0.8219 Values occured only once 0 0.0000
========================================= BedroomAbvGr_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 6 0.4110 Len 1460 NaN NA 0 0.0000 Most frequent 804 55.0685 Least frequent 14 0.9589 Values occured only once 0 0.0000
========================================= KitchenAbvGr_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1395 95.5479 Least frequent 65 4.4521 Values occured only once 0 0.0000
========================================= TotRmsAbvGrd_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 10 0.6849 Len 1460 NaN NA 0 0.0000 Most frequent 404 27.6712 Least frequent 11 0.7534 Values occured only once 0 0.0000
========================================= Fireplaces_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 695 47.6027 Least frequent 115 7.8767 Values occured only once 0 0.0000
========================================= GarageCars_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 829 56.7808 Least frequent 81 5.5479 Values occured only once 0 0.0000
========================================= 3SsnPorch_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1436 98.3562 Least frequent 24 1.6438 Values occured only once 0 0.0000
========================================= MiscVal_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1408 96.4384 Least frequent 11 0.7534 Values occured only once 0 0.0000
========================================= MoSold_str ========================================= ------------------------- This Columns is duplicate of <MoSold> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 12 0.8219 Len 1460 NaN NA 0 0.0000 Most frequent 253 17.3288 Least frequent 52 3.5616 Values occured only once 0 0.0000
========================================= YrSold_str ========================================= ------------------------- This Columns is duplicate of <YrSold> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 338 23.1507 Least frequent 175 11.9863 Values occured only once 0 0.0000
========================================= MSSubClass ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.4076567471495591)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.0 100.00
NA 0.0 0.00
Mean 56.9 NaN
Std 42.3 NaN
Min 20.0 NaN
25% 20.0 NaN
50% 50.0 NaN
75% 70.0 NaN
Max 190.0 NaN
Nunique 15.0 1.03
Outlies 30.0 2.05
Nagetive 0.0 0.00
Zeros 0.0 0.00
========================================= LotFrontage ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 2.0120008521763144)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 70.75 NaN
Std 23.47 NaN
Min 21.00 NaN
25% 60.00 NaN
50% 70.00 NaN
75% 80.00 NaN
Max 313.00 NaN
Nunique 224.00 15.34
Outlies 14.00 0.96
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= LotArea ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 12.207687851233496)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 10516.83 NaN
Std 9981.26 NaN
Min 1300.00 NaN
25% 7553.50 NaN
50% 9478.50 NaN
75% 11601.50 NaN
Max 215245.00 NaN
Nunique 1073.00 73.49
Outlies 13.00 0.89
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= OverallQual ========================================= Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: 0.2169439277628693)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 6.10 NaN
Std 1.38 NaN
Min 1.00 NaN
25% 5.00 NaN
50% 6.00 NaN
75% 7.00 NaN
Max 10.00 NaN
Nunique 10.00 0.68
Outlies 2.00 0.14
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= OverallCond ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.6930674724842182)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 5.58 NaN
Std 1.11 NaN
Min 1.00 NaN
25% 5.00 NaN
50% 5.00 NaN
75% 6.00 NaN
Max 9.00 NaN
Nunique 9.00 0.62
Outlies 28.00 1.92
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= YearBuilt ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: -0.613461172488183)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1971.27 NaN
Std 30.20 NaN
Min 1872.00 NaN
25% 1954.00 NaN
50% 1973.00 NaN
75% 2000.00 NaN
Max 2010.00 NaN
Nunique 112.00 7.67
Outlies 6.00 0.41
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= YearRemodAdd ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: -0.5035620027004709)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1984.87 NaN
Std 20.65 NaN
Min 1950.00 NaN
25% 1967.00 NaN
50% 1994.00 NaN
75% 2004.00 NaN
Max 2010.00 NaN
Nunique 61.00 4.18
Outlies 0.00 0.00
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= MasVnrArea ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 2.6682455485578593)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 103.84 NaN
Std 180.74 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 166.00 NaN
Max 1600.00 NaN
Nunique 335.00 22.95
Outlies 32.00 2.19
Nagetive 0.00 0.00
Zeros 861.00 58.97
========================================= BsmtFinSF1 ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.685503071910789)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 443.64 NaN
Std 456.10 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 383.50 NaN
75% 712.25 NaN
Max 5644.00 NaN
Nunique 637.00 43.63
Outlies 6.00 0.41
Nagetive 0.00 0.00
Zeros 467.00 31.99
========================================= BsmtFinSF2 ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 4.255261108933303)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 46.55 NaN
Std 161.32 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 1474.00 NaN
Nunique 144.00 9.86
Outlies 50.00 3.42
Nagetive 0.00 0.00
Zeros 1293.00 88.56
========================================= BsmtUnfSF ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.9202684528039037)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 567.24 NaN
Std 441.87 NaN
Min 0.00 NaN
25% 223.00 NaN
50% 477.50 NaN
75% 808.00 NaN
Max 2336.00 NaN
Nunique 780.00 53.42
Outlies 11.00 0.75
Nagetive 0.00 0.00
Zeros 118.00 8.08
========================================= TotalBsmtSF ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.5242545490627664)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1057.43 NaN
Std 438.71 NaN
Min 0.00 NaN
25% 795.75 NaN
50% 991.50 NaN
75% 1298.25 NaN
Max 6110.00 NaN
Nunique 721.00 49.38
Outlies 10.00 0.68
Nagetive 0.00 0.00
Zeros 37.00 2.53
========================================= 1stFlrSF ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.3767566220336365)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1162.63 NaN
Std 386.59 NaN
Min 334.00 NaN
25% 882.00 NaN
50% 1087.00 NaN
75% 1391.25 NaN
Max 4692.00 NaN
Nunique 753.00 51.58
Outlies 12.00 0.82
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= 2ndFlrSF ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.8130298163023265)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 346.99 NaN
Std 436.53 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 728.00 NaN
Max 2065.00 NaN
Nunique 417.00 28.56
Outlies 4.00 0.27
Nagetive 0.00 0.00
Zeros 829.00 56.78
========================================= LowQualFinSF ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 9.011341288465387)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 5.84 NaN
Std 48.62 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 572.00 NaN
Nunique 24.00 1.64
Outlies 20.00 1.37
Nagetive 0.00 0.00
Zeros 1434.00 98.22
========================================= GrLivArea ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.3665603560164552)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1515.46 NaN
Std 525.48 NaN
Min 334.00 NaN
25% 1129.50 NaN
50% 1464.00 NaN
75% 1776.75 NaN
Max 5642.00 NaN
Nunique 861.00 58.97
Outlies 16.00 1.10
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= BsmtFullBath ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.596066609663168)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 0.43 NaN
Std 0.52 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 1.00 NaN
Max 3.00 NaN
Nunique 4.00 0.27
Outlies 16.00 1.10
Nagetive 0.00 0.00
Zeros 856.00 58.63
========================================= BsmtHalfBath ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 4.103402697955168)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 0.06 NaN
Std 0.24 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 2.00 NaN
Nunique 3.00 0.21
Outlies 82.00 5.62
Nagetive 0.00 0.00
Zeros 1378.00 94.38
========================================= FullBath ========================================= ------------------------- This Columns is duplicate of <FullBath_str> column Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: 0.036561558402727165)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1.57 NaN
Std 0.55 NaN
Min 0.00 NaN
25% 1.00 NaN
50% 2.00 NaN
75% 2.00 NaN
Max 3.00 NaN
Nunique 4.00 0.27
Outlies 0.00 0.00
Nagetive 0.00 0.00
Zeros 9.00 0.62
========================================= HalfBath ========================================= ------------------------- This Columns is duplicate of <HalfBath_str> column Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.675897448233722)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 0.38 NaN
Std 0.50 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 1.00 NaN
Max 2.00 NaN
Nunique 3.00 0.21
Outlies 12.00 0.82
Nagetive 0.00 0.00
Zeros 913.00 62.53
========================================= BedroomAbvGr ========================================= Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: 0.21179009627507137)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 2.87 NaN
Std 0.82 NaN
Min 0.00 NaN
25% 2.00 NaN
50% 3.00 NaN
75% 3.00 NaN
Max 8.00 NaN
Nunique 8.00 0.55
Outlies 14.00 0.96
Nagetive 0.00 0.00
Zeros 6.00 0.41
========================================= KitchenAbvGr ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 4.488396777072859)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1.05 NaN
Std 0.22 NaN
Min 0.00 NaN
25% 1.00 NaN
50% 1.00 NaN
75% 1.00 NaN
Max 3.00 NaN
Nunique 4.00 0.27
Outlies 68.00 4.66
Nagetive 0.00 0.00
Zeros 1.00 0.07
========================================= TotRmsAbvGrd ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.6763408364355531)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 6.52 NaN
Std 1.63 NaN
Min 2.00 NaN
25% 5.00 NaN
50% 6.00 NaN
75% 7.00 NaN
Max 14.00 NaN
Nunique 12.00 0.82
Outlies 12.00 0.82
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= Fireplaces ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.6495651830548841)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 0.61 NaN
Std 0.64 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 1.00 NaN
75% 1.00 NaN
Max 3.00 NaN
Nunique 4.00 0.27
Outlies 5.00 0.34
Nagetive 0.00 0.00
Zeros 690.00 47.26
========================================= GarageYrBlt ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: -0.541264504372725)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1977.23 NaN
Std 24.78 NaN
Min 1900.00 NaN
25% 1960.00 NaN
50% 1978.00 NaN
75% 2001.00 NaN
Max 2010.00 NaN
Nunique 148.00 10.14
Outlies 1.00 0.07
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= GarageCars ========================================= Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: -0.3425489297486655)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1.77 NaN
Std 0.75 NaN
Min 0.00 NaN
25% 1.00 NaN
50% 2.00 NaN
75% 2.00 NaN
Max 4.00 NaN
Nunique 5.00 0.34
Outlies 0.00 0.00
Nagetive 0.00 0.00
Zeros 81.00 5.55
========================================= GarageArea ========================================= Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: 0.17998090674623907)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 472.98 NaN
Std 213.80 NaN
Min 0.00 NaN
25% 334.50 NaN
50% 480.00 NaN
75% 576.00 NaN
Max 1418.00 NaN
Nunique 441.00 30.21
Outlies 7.00 0.48
Nagetive 0.00 0.00
Zeros 81.00 5.55
========================================= WoodDeckSF ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.5413757571931312)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 94.24 NaN
Std 125.34 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 168.00 NaN
Max 857.00 NaN
Nunique 274.00 18.77
Outlies 22.00 1.51
Nagetive 0.00 0.00
Zeros 761.00 52.12
========================================= OpenPorchSF ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 2.3643417403694404)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 46.66 NaN
Std 66.26 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 25.00 NaN
75% 68.00 NaN
Max 547.00 NaN
Nunique 202.00 13.84
Outlies 27.00 1.85
Nagetive 0.00 0.00
Zeros 656.00 44.93
========================================= EnclosedPorch ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 3.08987190371177)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 21.95 NaN
Std 61.12 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 552.00 NaN
Nunique 120.00 8.22
Outlies 51.00 3.49
Nagetive 0.00 0.00
Zeros 1252.00 85.75
========================================= 3SsnPorch ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 10.304342032693112)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 3.41 NaN
Std 29.32 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 508.00 NaN
Nunique 20.00 1.37
Outlies 23.00 1.58
Nagetive 0.00 0.00
Zeros 1436.00 98.36
========================================= ScreenPorch ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 4.122213743143115)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 15.06 NaN
Std 55.76 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 480.00 NaN
Nunique 76.00 5.21
Outlies 55.00 3.77
Nagetive 0.00 0.00
Zeros 1344.00 92.05
========================================= PoolArea ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 14.828373640750588)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 2.76 NaN
Std 40.18 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 738.00 NaN
Nunique 8.00 0.55
Outlies 7.00 0.48
Nagetive 0.00 0.00
Zeros 1453.00 99.52
========================================= MiscVal ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 24.476794188821916)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 43.49 NaN
Std 496.12 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 15500.00 NaN
Nunique 21.00 1.44
Outlies 8.00 0.55
Nagetive 0.00 0.00
Zeros 1408.00 96.44
========================================= MoSold ========================================= ------------------------- This Columns is duplicate of <MoSold_str> column Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: 0.21205298505146022)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 6.32 NaN
Std 2.70 NaN
Min 1.00 NaN
25% 5.00 NaN
50% 6.00 NaN
75% 8.00 NaN
Max 12.00 NaN
Nunique 12.00 0.82
Outlies 0.00 0.00
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= YrSold ========================================= ------------------------- This Columns is duplicate of <YrSold_str> column Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: 0.09626851386568028)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 2007.82 NaN
Std 1.33 NaN
Min 2006.00 NaN
25% 2007.00 NaN
50% 2008.00 NaN
75% 2009.00 NaN
Max 2010.00 NaN
Nunique 5.00 0.34
Outlies 0.00 0.00
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= SalePrice ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.8828757597682129)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.0 100.00
NA 0.0 0.00
Mean 180921.2 NaN
Std 79442.5 NaN
Min 34900.0 NaN
25% 129975.0 NaN
50% 163000.0 NaN
75% 214000.0 NaN
Max 755000.0 NaN
Nunique 663.0 45.41
Outlies 22.0 1.51
Nagetive 0.0 0.00
Zeros 0.0 0.00
---------------------------------------------------------------------------------------------- ****************************************** Modeling ****************************************** -------------------- This is Regression problem -------------------- '''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' ------------------------------------- Linear Regression ------------------------------------- ------------------------- NOTE: This summary saved as <summary_OLS_1.csv> ------------------------- Variable coef std err t P>|t| [0.025 0.975] Indicator 0 1stFlrSF -7.8163 12.613 -0.620 0.536 -32.573 16.941 1 2ndFlrSF -2.3219 12.210 -0.190 0.849 -26.288 21.644 2 3SsnPorch 23.3036 71.452 0.326 0.744 -116.945 163.552 3 3SsnPorch_str__"0.0" -4718.8306 7531.629 -0.627 0.531 -19500.000 10100.000 4 3SsnPorch_str__Rare cases 4719.2140 7531.601 0.627 0.531 -10100.000 19500.000 5 Alley_NA_indicator__Missing 1881.7931 2333.896 0.806 0.420 -2699.272 6462.858 6 Alley_NA_indicator__Not missing -1881.4097 2333.833 -0.806 0.420 -6462.351 2699.531 7 Alley__Grvl -1980.6575 1611.039 -1.229 0.219 -5142.870 1181.555 8 Alley__Pave 1981.0409 1611.038 1.230 0.219 -1181.168 5143.250 9 BedroomAbvGr -2580.5902 4887.255 -0.528 0.598 -12200.000 7012.310 10 BedroomAbvGr_str__"1.0" 3915.4579 11500.000 0.339 0.735 -18700.000 26600.000 11 BedroomAbvGr_str__"2.0" -1833.5954 6657.038 -0.275 0.783 -14900.000 11200.000 12 BedroomAbvGr_str__"3.0" -1220.4947 3655.794 -0.334 0.739 -8396.235 5955.245 13 BedroomAbvGr_str__"4.0" 8819.3901 5289.337 1.667 0.096 -1562.733 19200.000 . 
14 BedroomAbvGr_str__"5.0" -16250.0000 11200.000 -1.455 0.146 -38200.000 5678.966 15 BedroomAbvGr_str__Rare cases 6573.8779 11600.000 0.569 0.570 -16100.000 29300.000 16 BldgType__1Fam -657.1251 20200.000 -0.032 0.974 -40400.000 39000.000 17 BldgType__2fmCon -9951.2253 51800.000 -0.192 0.848 -112000.000 91800.000 18 BldgType__Duplex 1505.5745 7519.922 0.200 0.841 -13300.000 16300.000 19 BldgType__Twnhs 3111.9733 19300.000 0.162 0.872 -34700.000 40900.000 20 BldgType__TwnhsE 5991.1861 18700.000 0.321 0.748 -30600.000 42600.000 21 BsmtCond_NA_indicator__Missing -8291.8782 7222.467 -1.148 0.251 -22500.000 5884.670 22 BsmtCond_NA_indicator__Not missing 8292.2616 7222.380 1.148 0.251 -5884.115 22500.000 23 BsmtCond__Fa -3676.5892 3877.294 -0.948 0.343 -11300.000 3933.919 24 BsmtCond__Gd -1442.9844 3474.349 -0.415 0.678 -8262.577 5376.608 25 BsmtCond__TA 5119.9570 2415.074 2.120 0.034 379.554 9860.360 * 26 BsmtExposure_NA_indicator__Missing -1671.5152 13300.000 -0.126 0.900 -27800.000 24500.000 27 BsmtExposure_NA_indicator__Not missing 1671.8987 13300.000 0.126 0.900 -24500.000 27800.000 28 BsmtExposure__Av -5300.5227 2173.416 -2.439 0.015 -9566.592 -1034.454 * 29 BsmtExposure__Gd 14300.0000 3081.315 4.642 0.000 8256.641 20400.000 *** 30 BsmtExposure__Mn -849.4240 2617.012 -0.325 0.746 -5986.200 4287.352 31 BsmtExposure__No -8154.4394 1850.988 -4.405 0.000 -11800.000 -4521.246 *** 32 BsmtFinSF1 -9.1833 5.109 -1.797 0.073 -19.211 0.845 . 33 BsmtFinSF2 19.6292 12.913 1.520 0.129 -5.716 44.975 34 BsmtFinType1_NA_indicator__Missing -8291.8782 7222.467 -1.148 0.251 -22500.000 5884.670 35 BsmtFinType1_NA_indicator__Not missing 8292.2616 7222.380 1.148 0.251 -5884.115 22500.000 36 BsmtFinType1__ALQ 1955.8832 2450.428 0.798 0.425 -2853.914 6765.681 37 BsmtFinType1__BLQ 93.8689 2734.221 0.034 0.973 -5272.969 5460.707 38 BsmtFinType1__GLQ 5177.3103 2635.278 1.965 0.050 4.681 10300.000 . 
39 BsmtFinType1__LwQ -3824.2204 3889.441 -0.983 0.326 -11500.000 3810.130 40 BsmtFinType1__Rec 1777.4885 2874.065 0.618 0.536 -3863.841 7418.818 41 BsmtFinType1__Unf -5179.9471 2693.871 -1.923 0.055 -10500.000 107.691 . 42 BsmtFinType2_NA_indicator__Missing 16890.0000 14800.000 1.142 0.254 -12100.000 45900.000 43 BsmtFinType2_NA_indicator__Not missing -16890.0000 14800.000 -1.142 0.254 -45900.000 12100.000 44 BsmtFinType2__ALQ 4078.9312 7352.669 0.555 0.579 -10400.000 18500.000 45 BsmtFinType2__BLQ -5019.9093 5985.909 -0.839 0.402 -16800.000 6729.474 46 BsmtFinType2__GLQ -3235.8921 9021.926 -0.359 0.720 -20900.000 14500.000 47 BsmtFinType2__LwQ -3805.2154 5013.541 -0.759 0.448 -13600.000 6035.563 48 BsmtFinType2__Rec 1355.6873 4944.827 0.274 0.784 -8350.217 11100.000 49 BsmtFinType2__Unf 6626.7817 5153.801 1.286 0.199 -3489.306 16700.000 50 BsmtFullBath -2399.8645 11500.000 -0.209 0.834 -24900.000 20100.000 51 BsmtFullBath_str__"0.0" -455.1750 12500.000 -0.036 0.971 -25000.000 24100.000 52 BsmtFullBath_str__"1.0" 6997.9531 5080.298 1.377 0.169 -2973.859 17000.000 53 BsmtFullBath_str__"2.0" -6542.3947 15200.000 -0.429 0.668 -36400.000 23400.000 54 BsmtHalfBath -3826.3929 19800.000 -0.193 0.847 -42800.000 35100.000 55 BsmtHalfBath_str__"0.0" -4964.3481 10100.000 -0.491 0.623 -24800.000 14900.000 56 BsmtHalfBath_str__"1.0" 4964.7315 10100.000 0.491 0.623 -14900.000 24800.000 57 BsmtQual_NA_indicator__Missing -8291.8782 7222.467 -1.148 0.251 -22500.000 5884.670 58 BsmtQual_NA_indicator__Not missing 8292.2616 7222.380 1.148 0.251 -5884.115 22500.000 59 BsmtQual__Ex 10970.0000 4052.081 2.708 0.007 3017.628 18900.000 ** 60 BsmtQual__Fa -3390.8948 4623.048 -0.733 0.463 -12500.000 5683.409 61 BsmtQual__Gd -1835.0804 2557.445 -0.718 0.473 -6854.935 3184.775 62 BsmtQual__TA -5744.8567 2565.290 -2.239 0.025 -10800.000 -709.602 * 63 BsmtUnfSF -7.9243 5.422 -1.462 0.144 -18.566 2.718 64 CentralAir__N 1363.9873 2766.086 0.493 0.622 -4065.397 6793.371 65 CentralAir__Y -1363.6039 
2765.951 -0.493 0.622 -6792.724 4065.516 66 Condition1__Artery 501.0982 5408.759 0.093 0.926 -10100.000 11100.000 67 Condition1__Feedr -1991.9091 4393.967 -0.453 0.650 -10600.000 6632.746 68 Condition1__Norm 10210.0000 2900.867 3.520 0.000 4518.318 15900.000 *** 69 Condition1__PosN -11100.0000 7577.986 -1.464 0.143 -26000.000 3777.323 70 Condition1__RRAe -18080.0000 8623.946 -2.097 0.036 -35000.000 -1157.538 * 71 Condition1__RRAn 10990.0000 6214.327 1.769 0.077 -1204.244 23200.000 . 72 Condition1__Rare cases 9467.4701 8345.441 1.134 0.257 -6913.296 25800.000 73 Condition2__Norm 15100.0000 5834.151 2.589 0.010 3651.727 26600.000 * 74 Condition2__Rare cases -15100.0000 5834.273 -2.589 0.010 -26600.000 -3651.105 * 75 Electrical__FuseA 1414.5526 3498.353 0.404 0.686 -5452.154 8281.259 76 Electrical__FuseF -209.0680 5185.263 -0.040 0.968 -10400.000 9968.775 77 Electrical__SBrkr -1205.1012 3226.098 -0.374 0.709 -7537.416 5127.214 78 EnclosedPorch 11.4234 17.562 0.650 0.516 -23.047 45.894 79 ExterCond__Fa 1309.2522 5044.204 0.260 0.795 -8591.713 11200.000 80 ExterCond__Gd -616.5853 3258.776 -0.189 0.850 -7013.042 5779.871 81 ExterCond__TA -692.2834 2715.557 -0.255 0.799 -6022.487 4637.920 82 ExterQual__Ex 3422.9907 6472.389 0.529 0.597 -9281.274 16100.000 83 ExterQual__Fa -435.9133 9267.526 -0.047 0.962 -18600.000 17800.000 84 ExterQual__Gd -639.7592 4011.968 -0.159 0.873 -8514.612 7235.093 85 ExterQual__TA -2346.9347 3962.920 -0.592 0.554 -10100.000 5431.645 86 Exterior1st__AsbShng -732.2229 12600.000 -0.058 0.954 -25500.000 24000.000 87 Exterior1st__BrkFace 8016.9312 7249.128 1.106 0.269 -6211.948 22200.000 88 Exterior1st__CemntBd 29630.0000 20900.000 1.416 0.157 -11400.000 70700.000 89 Exterior1st__HdBoard -1485.4158 6579.234 -0.226 0.821 -14400.000 11400.000 90 Exterior1st__MetalSd -5066.7012 12800.000 -0.397 0.692 -30100.000 20000.000 91 Exterior1st__Plywood 1509.9516 6500.300 0.232 0.816 -11200.000 14300.000 92 Exterior1st__Stucco -16550.0000 13000.000 -1.274 0.203 
-42100.000 8950.587 93 Exterior1st__VinylSd 1102.5268 8812.046 0.125 0.900 -16200.000 18400.000 94 Exterior1st__Wd Sdng -10890.0000 5383.766 -2.022 0.043 -21500.000 -320.430 * 95 Exterior1st__WdShing -5537.2482 8052.948 -0.688 0.492 -21300.000 10300.000 96 Exterior2nd__AsbShng -482.0447 12700.000 -0.038 0.970 -25400.000 24500.000 97 Exterior2nd__BrkFace 3891.3470 8948.807 0.435 0.664 -13700.000 21500.000 98 Exterior2nd__CmentBd -26980.0000 21700.000 -1.244 0.214 -69600.000 15600.000 99 Exterior2nd__HdBoard 2680.0019 6491.192 0.413 0.680 -10100.000 15400.000 100 Exterior2nd__ImStucc 2302.0800 13600.000 0.169 0.865 -24400.000 29000.000 101 Exterior2nd__MetalSd 5965.1701 13200.000 0.453 0.651 -19900.000 31800.000 102 Exterior2nd__Plywood -791.6577 5758.935 -0.137 0.891 -12100.000 10500.000 103 Exterior2nd__Rare cases -9567.5077 10100.000 -0.950 0.342 -29300.000 10200.000 104 Exterior2nd__Stucco 8630.7279 13400.000 0.643 0.521 -17700.000 35000.000 105 Exterior2nd__VinylSd 2886.0342 8742.957 0.330 0.741 -14300.000 20000.000 106 Exterior2nd__Wd Sdng 14370.0000 5340.237 2.691 0.007 3887.767 24900.000 ** 107 Exterior2nd__Wd Shng -2902.1224 6624.503 -0.438 0.661 -15900.000 10100.000 108 Fence_NA_indicator__Missing -499.4858 1289.104 -0.387 0.699 -3029.792 2030.820 109 Fence_NA_indicator__Not missing 499.8693 1289.178 0.388 0.698 -2030.582 3030.320 110 Fence__GdPrv -2566.9148 2328.740 -1.102 0.271 -7137.860 2004.030 111 Fence__GdWo 2378.2791 2266.285 1.049 0.294 -2070.076 6826.635 112 Fence__MnPrv -479.1651 1995.239 -0.240 0.810 -4395.500 3437.170 113 Fence__MnWw 668.1842 4528.590 0.148 0.883 -8220.714 9557.083 114 FireplaceQu_NA_indicator__Missing -783.3880 474.874 -1.650 0.099 -1715.489 148.713 . 115 FireplaceQu_NA_indicator__Not missing 783.7714 474.657 1.651 0.099 -147.904 1715.447 . 
116 FireplaceQu__Ex 2686.9854 4582.901 0.586 0.558 -6308.517 11700.000 117 FireplaceQu__Fa -2358.3501 3089.400 -0.763 0.445 -8422.349 3705.648 118 FireplaceQu__Gd -1545.6113 1986.968 -0.778 0.437 -5445.711 2354.489 119 FireplaceQu__Po 1266.0200 3601.625 0.352 0.725 -5803.394 8335.434 120 FireplaceQu__TA -48.6606 2042.791 -0.024 0.981 -4058.333 3961.012 121 Fireplaces -19080.0000 5204.562 -3.667 0.000 -29300.000 -8868.198 *** 122 Fireplaces_str__"0.0" -24120.0000 5217.722 -4.622 0.000 -34400.000 -13900.000 *** 123 Fireplaces_str__"1.0" -2681.0305 1449.449 -1.850 0.065 -5526.066 164.005 . 124 Fireplaces_str__"2.0" 26800.0000 5579.261 4.803 0.000 15800.000 37700.000 *** 125 Foundation__BrkTil -5561.9282 4579.202 -1.215 0.225 -14600.000 3426.313 126 Foundation__CBlock 339.8427 4024.215 0.084 0.933 -7559.047 8238.733 127 Foundation__PConc 2334.4800 4276.886 0.546 0.585 -6060.364 10700.000 128 Foundation__Rare cases 4020.6915 11000.000 0.364 0.716 -17700.000 25700.000 129 Foundation__Slab -1132.7025 9713.851 -0.117 0.907 -20200.000 17900.000 130 FullBath 13410.0000 6671.404 2.010 0.045 313.905 26500.000 * 131 FullBath_str__"1.0" -7851.7460 9063.448 -0.866 0.387 -25600.000 9938.354 132 FullBath_str__"2.0" -14840.0000 4550.693 -3.261 0.001 -23800.000 -5906.026 ** 133 FullBath_str__"3.0" 16980.0000 4299.205 3.949 0.000 8540.403 25400.000 *** 134 FullBath_str__Rare cases 5711.3818 9129.920 0.626 0.532 -12200.000 23600.000 135 Functional__Maj1 -10650.0000 8538.039 -1.247 0.213 -27400.000 6109.843 136 Functional__Min1 3364.9980 5772.905 0.583 0.560 -7966.291 14700.000 137 Functional__Min2 -2164.6055 5768.628 -0.375 0.708 -13500.000 9158.290 138 Functional__Mod -1609.9158 8617.959 -0.187 0.852 -18500.000 15300.000 139 Functional__Typ 11060.0000 3799.584 2.911 0.004 3600.893 18500.000 ** 140 GarageArea -10.3409 10.614 -0.974 0.330 -31.175 10.493 141 GarageCars 11020.0000 9096.434 1.212 0.226 -6832.742 28900.000 142 GarageCars_str__"0.0" 727.5351 1661.179 0.438 0.662 -2533.094 
3988.164 143 GarageCars_str__"1.0" -1816.1112 8809.166 -0.206 0.837 -19100.000 15500.000 144 GarageCars_str__"2.0" -5723.7661 1790.085 -3.197 0.001 -9237.416 -2210.116 ** 145 GarageCars_str__"3.0" 6812.7256 9890.979 0.689 0.491 -12600.000 26200.000 146 GarageCond_NA_indicator__Missing 727.5351 1661.179 0.438 0.662 -2533.094 3988.164 147 GarageCond_NA_indicator__Not missing -727.1516 1661.193 -0.438 0.662 -3987.808 2533.504 148 GarageCond__Fa 4076.2462 5771.281 0.706 0.480 -7251.856 15400.000 149 GarageCond__Gd -743.9386 8644.106 -0.086 0.931 -17700.000 16200.000 150 GarageCond__Po -8563.6419 11100.000 -0.769 0.442 -30400.000 13300.000 151 GarageCond__TA 5231.7177 5071.013 1.032 0.303 -4721.871 15200.000 152 GarageFinish_NA_indicator__Missing 727.5351 1661.179 0.438 0.662 -2533.094 3988.164 153 GarageFinish_NA_indicator__Not missing -727.1516 1661.193 -0.438 0.662 -3987.808 2533.504 154 GarageFinish__Fin 2417.1179 1754.427 1.378 0.169 -1026.542 5860.778 155 GarageFinish__RFn -1125.7719 1517.488 -0.742 0.458 -4104.359 1852.815 156 GarageFinish__Unf -1290.9626 1839.847 -0.702 0.483 -4902.288 2320.363 157 GarageQual_NA_indicator__Missing 727.5351 1661.179 0.438 0.662 -2533.094 3988.164 158 GarageQual_NA_indicator__Not missing -727.1516 1661.193 -0.438 0.662 -3987.808 2533.504 159 GarageQual__Fa -10120.0000 5087.144 -1.990 0.047 -20100.000 -139.028 * 160 GarageQual__Gd 12060.0000 7099.162 1.699 0.090 -1870.575 26000.000 . 161 GarageQual__TA -1939.2834 3991.229 -0.486 0.627 -9773.428 5894.862 162 GarageType_NA_indicator__Missing 727.5351 1661.179 0.438 0.662 -2533.094 3988.164 163 GarageType_NA_indicator__Not missing -727.1516 1661.193 -0.438 0.662 -3987.808 2533.504 164 GarageType__Attchd -4318.3019 3345.217 -1.291 0.197 -10900.000 2247.825 165 GarageType__Basment 7462.5291 7919.330 0.942 0.346 -8081.849 23000.000 166 GarageType__BuiltIn -9733.4806 4987.970 -1.951 0.051 -19500.000 57.108 . 
167 GarageType__CarPort 6939.3664 8825.394 0.786 0.432 -10400.000 24300.000 168 GarageType__Detchd -349.7296 3505.461 -0.100 0.921 -7230.389 6530.930 169 GarageYrBlt -19.5433 83.201 -0.235 0.814 -182.853 143.766 170 GarageYrBlt_NA_indicator__Missing 727.5351 1661.179 0.438 0.662 -2533.094 3988.164 171 GarageYrBlt_NA_indicator__Not missing -727.1516 1661.193 -0.438 0.662 -3987.808 2533.504 172 GrLivArea 52.1632 12.109 4.308 0.000 28.394 75.932 *** 173 HalfBath -2935.4332 4681.450 -0.627 0.531 -12100.000 6253.505 174 HalfBath_str__"0.0" -1614.7854 1025.519 -1.575 0.116 -3627.715 398.144 175 HalfBath_str__"1.0" 6165.7708 4731.229 1.303 0.193 -3120.876 15500.000 176 HalfBath_str__"2.0" -4550.6020 4593.264 -0.991 0.322 -13600.000 4465.242 177 HeatingQC__Ex 1229.8420 2308.660 0.533 0.594 -3301.689 5761.372 178 HeatingQC__Fa 2064.9344 4576.387 0.451 0.652 -6917.781 11000.000 179 HeatingQC__Gd -2555.5386 2320.533 -1.101 0.271 -7110.375 1999.298 180 HeatingQC__TA -738.8543 2144.572 -0.345 0.731 -4948.306 3470.598 181 Heating__GasA 2346.0492 6025.947 0.389 0.697 -9481.922 14200.000 182 Heating__GasW 3507.0692 8154.142 0.430 0.667 -12500.000 19500.000 183 Heating__Rare cases -5852.7349 9018.722 -0.649 0.517 -23600.000 11800.000 184 HouseStyle__1.5Fin -8037.6487 11200.000 -0.715 0.475 -30100.000 14000.000 185 HouseStyle__1.5Unf 21640.0000 31000.000 0.698 0.485 -39200.000 82500.000 186 HouseStyle__1Story 90.0232 9919.235 0.009 0.993 -19400.000 19600.000 187 HouseStyle__2.5Unf -15140.0000 20000.000 -0.756 0.450 -54500.000 24200.000 188 HouseStyle__2Story -7982.6897 9562.751 -0.835 0.404 -26800.000 10800.000 189 HouseStyle__Rare cases -2978.5384 21500.000 -0.138 0.890 -45200.000 39300.000 190 HouseStyle__SFoyer 6426.7421 14300.000 0.449 0.653 -21700.000 34500.000 191 HouseStyle__SLvl 5982.3074 16400.000 0.366 0.715 -26100.000 38100.000 192 KitchenAbvGr 1312.1635 15200.000 0.087 0.931 -28400.000 31100.000 193 KitchenAbvGr_str__"1.0" 7027.1457 7991.732 0.879 0.379 -8659.346 
22700.000 194 KitchenAbvGr_str__"2.0" -7026.7622 7991.677 -0.879 0.380 -22700.000 8659.622 195 KitchenQual__Ex 8612.0761 4329.703 1.989 0.047 113.562 17100.000 * 196 KitchenQual__Fa 2526.7026 5361.600 0.471 0.638 -7997.261 13100.000 197 KitchenQual__Gd -4334.4466 2642.029 -1.641 0.101 -9520.327 851.434 198 KitchenQual__TA -6803.9488 2567.172 -2.650 0.008 -11800.000 -1765.001 ** 199 LandContour__Bnk -15880.0000 4452.872 -3.566 0.000 -24600.000 -7139.337 *** 200 LandContour__HLS 4696.1417 4575.812 1.026 0.305 -4285.445 13700.000 201 LandContour__Low 5738.3916 5791.022 0.991 0.322 -5628.458 17100.000 202 LandContour__Lvl 5445.4619 3128.901 1.740 0.082 -696.071 11600.000 . 203 LandSlope__Gtl 2328.8313 6293.011 0.370 0.711 -10000.000 14700.000 204 LandSlope__Mod 7482.4480 6248.464 1.197 0.231 -4782.288 19700.000 205 LandSlope__Sev -9810.8959 11200.000 -0.878 0.380 -31700.000 12100.000 206 LotArea 0.6095 0.162 3.762 0.000 0.291 0.928 *** 207 LotConfig__Corner 1759.6378 2383.252 0.738 0.461 -2918.304 6437.580 208 LotConfig__CulDSac 10040.0000 3233.673 3.104 0.002 3691.383 16400.000 ** 209 LotConfig__FR2 -12470.0000 3986.806 -3.128 0.002 -20300.000 -4647.173 ** 210 LotConfig__Inside 674.8151 1951.121 0.346 0.730 -3154.923 4504.554 211 LotFrontage -138.7414 66.595 -2.083 0.038 -269.456 -8.026 * 212 LotFrontage_NA_indicator__Missing -689.2890 1334.840 -0.516 0.606 -3309.367 1930.789 213 LotFrontage_NA_indicator__Not missing 689.6725 1334.882 0.517 0.606 -1930.487 3309.831 214 LotShape__IR1 -951.5591 3460.914 -0.275 0.783 -7744.780 5841.662 215 LotShape__IR2 4690.0655 5215.157 0.899 0.369 -5546.454 14900.000 216 LotShape__IR3 -5727.1099 8509.849 -0.673 0.501 -22400.000 11000.000 217 LotShape__Reg 1988.9869 3481.516 0.571 0.568 -4844.672 8822.646 218 LowQualFinSF 62.3015 33.508 1.859 0.063 -3.468 128.071 . 
219 LowQualFinSF_str__"0.0" 8232.4951 7632.733 1.079 0.281 -6749.339 23200.000 220 LowQualFinSF_str__Rare cases -8232.1117 7632.649 -1.079 0.281 -23200.000 6749.558 221 MSSubClass 209.0630 1128.542 0.185 0.853 -2006.085 2424.211 222 MSSubClass_str__"120.0" -19650.0000 43600.000 -0.451 0.652 -105000.000 65900.000 223 MSSubClass_str__"160.0" -41810.0000 87400.000 -0.478 0.632 -213000.000 130000.000 224 MSSubClass_str__"180.0" -53310.0000 110000.000 -0.484 0.628 -269000.000 163000.000 225 MSSubClass_str__"190.0" -9951.2253 51800.000 -0.192 0.848 -112000.000 91800.000 226 MSSubClass_str__"20.0" 32800.0000 73300.000 0.447 0.655 -111000.000 177000.000 227 MSSubClass_str__"30.0" 24350.0000 63600.000 0.383 0.702 -100000.000 149000.000 228 MSSubClass_str__"45.0" 5800.6073 56100.000 0.103 0.918 -104000.000 116000.000 229 MSSubClass_str__"50.0" 23670.0000 41300.000 0.573 0.567 -57400.000 105000.000 230 MSSubClass_str__"60.0" 8988.4997 29100.000 0.309 0.758 -48200.000 66100.000 231 MSSubClass_str__"70.0" 12490.0000 20400.000 0.612 0.541 -27500.000 52500.000 232 MSSubClass_str__"75.0" 11300.0000 25100.000 0.450 0.652 -37900.000 60600.000 233 MSSubClass_str__"80.0" 7779.0699 17800.000 0.438 0.661 -27100.000 42600.000 234 MSSubClass_str__"85.0" -3965.4521 15700.000 -0.253 0.801 -34800.000 26800.000 235 MSSubClass_str__"90.0" 1505.5745 7519.922 0.200 0.841 -13300.000 16300.000 236 MSZoning__C (all) -23410.0000 11800.000 -1.980 0.048 -46600.000 -201.719 * 237 MSZoning__FV 16100.0000 8190.902 1.965 0.050 20.411 32200.000 . 
238 MSZoning__RH 6164.7116 8822.004 0.699 0.485 -11200.000 23500.000 239 MSZoning__RL 1938.3297 4593.380 0.422 0.673 -7077.741 11000.000 240 MSZoning__RM -791.2940 5033.480 -0.157 0.875 -10700.000 9088.623 241 MasVnrArea 18.7504 8.157 2.299 0.022 2.740 34.760 * 242 MasVnrArea_NA_indicator__Not missing 2507.7872 2941.608 0.853 0.394 -3266.120 8281.694 243 MasVnrArea_NA_indicator__Rare cases -2507.4037 2941.685 -0.852 0.394 -8281.462 3266.654 244 MasVnrType_NA_indicator__Not missing 2507.7872 2941.608 0.853 0.394 -3266.120 8281.694 245 MasVnrType_NA_indicator__Rare cases -2507.4037 2941.685 -0.852 0.394 -8281.462 3266.654 246 MasVnrType__BrkCmn -10310.0000 6544.107 -1.575 0.116 -23200.000 2540.033 247 MasVnrType__BrkFace 284.8077 2723.092 0.105 0.917 -5060.186 5629.802 248 MasVnrType__None 3521.8631 2964.651 1.188 0.235 -2297.273 9340.999 249 MasVnrType__Stone 6498.7159 3525.338 1.843 0.066 -420.960 13400.000 . 250 MiscFeature_NA_indicator__Missing -6605.3742 13900.000 -0.475 0.635 -33900.000 20700.000 251 MiscFeature_NA_indicator__Not missing 6605.7576 13900.000 0.475 0.635 -20700.000 33900.000 252 MiscFeature__Othr -4070.7299 11300.000 -0.361 0.718 -26200.000 18000.000 253 MiscFeature__Shed 7821.9288 6318.024 1.238 0.216 -4579.342 20200.000 254 MiscFeature__TenC -3750.8155 8016.606 -0.468 0.640 -19500.000 12000.000 255 MiscVal 0.8021 1.881 0.426 0.670 -2.891 4.495 256 MiscVal_str__"0.0" 9502.5133 18900.000 0.503 0.615 -27600.000 46600.000 257 MiscVal_str__"400.0" -6497.3462 11600.000 -0.562 0.574 -29200.000 16200.000 258 MiscVal_str__Rare cases -3004.7837 10600.000 -0.283 0.777 -23900.000 17900.000 259 MoSold -593.6744 365.863 -1.623 0.105 -1311.804 124.455 260 MoSold_str__"1.0" 4052.9206 3710.622 1.092 0.275 -3230.438 11300.000 261 MoSold_str__"10.0" -6586.0072 3430.105 -1.920 0.055 -13300.000 146.741 . 
262 MoSold_str__"11.0" 3261.9881 3481.555 0.937 0.349 -3571.748 10100.000 263 MoSold_str__"12.0" 715.0770 3797.457 0.188 0.851 -6738.725 8168.879 264 MoSold_str__"2.0" -5780.0210 3780.408 -1.529 0.127 -13200.000 1640.315 265 MoSold_str__"3.0" -1027.2701 3180.705 -0.323 0.747 -7270.485 5215.945 266 MoSold_str__"4.0" 1219.9308 2875.211 0.424 0.671 -4423.648 6863.510 267 MoSold_str__"5.0" 425.0874 2399.001 0.177 0.859 -4283.769 5133.943 268 MoSold_str__"6.0" 688.8702 2196.517 0.314 0.754 -3622.542 5000.282 269 MoSold_str__"7.0" 4668.7214 2329.075 2.005 0.045 97.119 9240.324 * 270 MoSold_str__"8.0" -2323.3292 2879.895 -0.807 0.420 -7976.103 3329.445 271 MoSold_str__"9.0" 684.4155 3906.799 0.175 0.861 -6984.006 8352.837 272 Neighborhood__Blmngtn 8898.3208 10200.000 0.870 0.384 -11200.000 29000.000 273 Neighborhood__BrDale 851.7081 11600.000 0.073 0.942 -21900.000 23600.000 274 Neighborhood__BrkSide -5571.3563 6492.253 -0.858 0.391 -18300.000 7171.898 275 Neighborhood__ClearCr -5608.8279 8064.181 -0.696 0.487 -21400.000 10200.000 276 Neighborhood__CollgCr -2391.8826 3931.458 -0.608 0.543 -10100.000 5324.941 277 Neighborhood__Crawfor 16160.0000 6034.185 2.678 0.008 4316.191 28000.000 ** 278 Neighborhood__Edwards -22720.0000 4317.363 -5.262 0.000 -31200.000 -14200.000 *** 279 Neighborhood__Gilbert -5695.3204 5300.230 -1.075 0.283 -16100.000 4708.185 280 Neighborhood__IDOTRR -12570.0000 8575.214 -1.466 0.143 -29400.000 4257.291 281 Neighborhood__MeadowV -9615.6311 13100.000 -0.737 0.462 -35200.000 16000.000 282 Neighborhood__Mitchel -14520.0000 5634.512 -2.577 0.010 -25600.000 -3461.588 * 283 Neighborhood__NAmes -10310.0000 3703.522 -2.783 0.006 -17600.000 -3036.281 ** 284 Neighborhood__NWAmes -7765.5963 5043.373 -1.540 0.124 -17700.000 2133.739 285 Neighborhood__NoRidge 34920.0000 6782.590 5.148 0.000 21600.000 48200.000 *** 286 Neighborhood__NridgHt 25730.0000 5554.711 4.632 0.000 14800.000 36600.000 *** 287 Neighborhood__OldTown -13410.0000 6496.737 -2.064 0.039 
-26200.000 -656.014 * 288 Neighborhood__Rare cases 11810.0000 12600.000 0.940 0.347 -12800.000 36500.000 289 Neighborhood__SWISU -24620.0000 8976.734 -2.743 0.006 -42200.000 -6999.900 ** 290 Neighborhood__Sawyer -6548.3699 4699.991 -1.393 0.164 -15800.000 2676.961 291 Neighborhood__SawyerW -1972.7708 4862.190 -0.406 0.685 -11500.000 7570.931 292 Neighborhood__Somerst -938.1490 8148.718 -0.115 0.908 -16900.000 15100.000 293 Neighborhood__StoneBr 43120.0000 7651.029 5.636 0.000 28100.000 58100.000 *** 294 Neighborhood__Timber -11260.0000 6507.294 -1.730 0.084 -24000.000 1516.271 . 295 Neighborhood__Veenker 14020.0000 9555.638 1.467 0.143 -4739.385 32800.000 296 OpenPorchSF 32.5969 17.460 1.867 0.062 -1.675 66.869 . 297 OverallCond 2023.2101 4604.864 0.439 0.661 -7015.402 11100.000 298 OverallCond_str__"3.0" -8761.9856 14600.000 -0.602 0.548 -37300.000 19800.000 299 OverallCond_str__"4.0" -6688.1319 10300.000 -0.652 0.515 -26800.000 13400.000 300 OverallCond_str__"5.0" -2302.9238 5764.646 -0.399 0.690 -13600.000 9012.154 301 OverallCond_str__"6.0" 1747.6884 2858.778 0.611 0.541 -3863.636 7359.013 302 OverallCond_str__"7.0" 7716.3977 5290.143 1.459 0.145 -2667.306 18100.000 303 OverallCond_str__"8.0" 1640.2865 10100.000 0.163 0.871 -18100.000 21400.000 304 OverallCond_str__"9.0" 6649.0521 15500.000 0.428 0.669 -23900.000 37200.000 305 OverallQual 7782.7842 6578.222 1.183 0.237 -5129.214 20700.000 306 OverallQual_str__"10.0" 14430.0000 25100.000 0.575 0.565 -34800.000 63700.000 307 OverallQual_str__"3.0" 795.1478 23300.000 0.034 0.973 -45000.000 46600.000 308 OverallQual_str__"4.0" -3258.6122 16700.000 -0.196 0.845 -36000.000 29500.000 309 OverallQual_str__"5.0" -8955.9951 10900.000 -0.825 0.410 -30300.000 12400.000 310 OverallQual_str__"6.0" -11320.0000 4852.299 -2.333 0.020 -20800.000 -1797.186 * 311 OverallQual_str__"7.0" -11790.0000 4441.861 -2.654 0.008 -20500.000 -3070.359 ** 312 OverallQual_str__"8.0" -4242.6962 10100.000 -0.419 0.675 -24100.000 15600.000 313 
OverallQual_str__"9.0" 24350.0000 17600.000 1.381 0.168 -10300.000 59000.000 314 PavedDrive__N -204.0493 3672.084 -0.056 0.956 -7411.763 7003.664 315 PavedDrive__P 125.0322 4647.113 0.027 0.979 -8996.508 9246.572 316 PavedDrive__Y 79.4006 2987.590 0.027 0.979 -5784.761 5943.562 317 PoolArea -28.6800 24.519 -1.170 0.242 -76.807 19.446 318 PoolQC__Ex 804.5380 2509.551 0.321 0.749 -4121.310 5730.386 319 PoolQC__Fa 944.2044 2117.761 0.446 0.656 -3212.622 5101.031 320 PoolQC__Gd -1748.3590 1981.536 -0.882 0.378 -5637.797 2141.079 321 RoofMatl__CompShg -3657.8301 10400.000 -0.353 0.724 -24000.000 16700.000 322 RoofMatl__Rare cases 7365.9624 9714.479 0.758 0.449 -11700.000 26400.000 323 RoofMatl__Tar&Grv -3707.7489 17000.000 -0.217 0.828 -37200.000 29800.000 324 RoofStyle__Flat -1387.7548 19400.000 -0.071 0.943 -39500.000 36700.000 325 RoofStyle__Gable -5263.7560 6084.342 -0.865 0.387 -17200.000 6678.835 326 RoofStyle__Gambrel -4020.5639 10700.000 -0.375 0.708 -25100.000 17000.000 327 RoofStyle__Hip -5775.6298 6358.011 -0.908 0.364 -18300.000 6704.129 328 RoofStyle__Rare cases 16450.0000 11500.000 1.434 0.152 -6073.114 39000.000 329 SaleCondition__Abnorml -6090.0190 7035.650 -0.866 0.387 -19900.000 7719.837 330 SaleCondition__Alloca 4526.0453 12600.000 0.358 0.720 -20300.000 29300.000 331 SaleCondition__Family -11660.0000 8874.699 -1.314 0.189 -29100.000 5756.829 332 SaleCondition__Normal -170.0886 6524.633 -0.026 0.979 -13000.000 12600.000 333 SaleCondition__Partial 13400.0000 22800.000 0.588 0.557 -31400.000 58200.000 334 SaleType__COD -1857.5962 8478.234 -0.219 0.827 -18500.000 14800.000 335 SaleType__New -5453.7123 21600.000 -0.253 0.800 -47800.000 36900.000 336 SaleType__Rare cases 8174.6465 8940.149 0.914 0.361 -9373.437 25700.000 337 SaleType__WD -862.9546 7526.746 -0.115 0.909 -15600.000 13900.000 338 ScreenPorch 50.4601 17.299 2.917 0.004 16.505 84.415 ** 339 TotRmsAbvGrd 1821.4926 5343.766 0.341 0.733 -8667.466 12300.000 340 TotRmsAbvGrd_str__"10.0" 15580.0000 
13900.000 1.118 0.264 -11800.000 42900.000 341 TotRmsAbvGrd_str__"11.0" -29510.0000 19800.000 -1.489 0.137 -68400.000 9382.096 342 TotRmsAbvGrd_str__"12.0" -12830.0000 24200.000 -0.531 0.595 -60200.000 34600.000 343 TotRmsAbvGrd_str__"3.0" -1891.5622 24200.000 -0.078 0.938 -49400.000 45600.000 344 TotRmsAbvGrd_str__"4.0" 3737.4070 18300.000 0.205 0.838 -32100.000 39600.000 345 TotRmsAbvGrd_str__"5.0" 5081.3684 12900.000 0.394 0.694 -20300.000 30400.000 346 TotRmsAbvGrd_str__"6.0" 4094.6722 7986.530 0.513 0.608 -11600.000 19800.000 347 TotRmsAbvGrd_str__"7.0" 4131.7814 3837.773 1.077 0.282 -3401.153 11700.000 348 TotRmsAbvGrd_str__"8.0" 4649.8681 4552.791 1.021 0.307 -4286.533 13600.000 349 TotRmsAbvGrd_str__"9.0" 6963.4045 9007.686 0.773 0.440 -10700.000 24600.000 350 TotalBsmtSF 2.5216 6.409 0.393 0.694 -10.059 15.102 351 WoodDeckSF 19.2837 8.125 2.373 0.018 3.336 35.232 * 352 YearBuilt 155.1413 118.273 1.312 0.190 -77.009 387.292 353 YearRemodAdd 89.5473 78.454 1.141 0.254 -64.445 243.540 354 YrSold -235.6229 153.221 -1.538 0.124 -536.372 65.126 355 YrSold_str__"2006.0" 1434.5202 1856.169 0.773 0.440 -2208.842 5077.883 356 YrSold_str__"2007.0" -2724.1054 1748.398 -1.558 0.120 -6155.932 707.721 357 YrSold_str__"2008.0" 371.3292 1767.629 0.210 0.834 -3098.244 3840.903 358 YrSold_str__"2009.0" 2697.8848 1705.531 1.582 0.114 -649.799 6045.569 359 YrSold_str__"2010.0" -1779.2455 2374.509 -0.749 0.454 -6440.028 2881.537 ------------------------- --- Model statistic --- R-squared : 0.918 Adj. R-squared : 0.892 F-statistic : 35 Prob (F-statistic): 0.0 No. 
Observations : 1095 AIC : 25567 Df Residuals : 826 BIC : 26911 RMSE (test) : 32029 ------------------------- Maximum correlation between Reseduals and any data columns is 9.164997151410843e-13, with columns <LotShape__IR2> Mean of train reseduals: -1.071393064576197e-08 ------------------------------------- Random Forest ------------------------------------- ------------------------- RF model peramters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': True, 'random_state': None, 'verbose': 0, 'warm_start': False} -------------------------
-------------------------
--- Model statistic ---
R^2 (test) : 0.8761020925815912
R^2 (train): 0.9811453430304609
RMSE (test): 29458
oob score : 0.85891384774956
-------------------------
Maximum correlation between Reseduals and any data columns is 0.3752565328359765, with columns <PoolArea>